diff --git a/CMakeLists.txt b/CMakeLists.txt index bf3809b5810a34b0a7c70a64d9d70359c46ebc98..c664f43e9e446a08bdcbe844ee7741a86a72660e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.0.0) option(USE_OPENMP "build with openmp support" ON) option(USE_EXCEPTION "build with exception" ON) -option(WITH_LOGGING "print logging for debug" ON) +option(WITH_LOGGING "print logging for debug" OFF) option(WITH_SYMBOL "build with all symbols" ON) # turn off if use jni or ios io option(WITH_PROFILE "print op profile for debug" OFF) option(WITH_TEST "build with unit tests" ON) diff --git a/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj b/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj index dbacb00f0d857655ef6048cff24ad6cab5cb91f4..245483a35b2b722f11b9b53c1691f44d6274d945 100644 --- a/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj +++ b/metal/MobileNetDemo/MobileNetDemo.xcodeproj/project.pbxproj @@ -15,6 +15,7 @@ FCB40E5121E0CEBB0075EC91 /* mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCB40E4F21E0CEBB0075EC91 /* mobilenet_model */; }; FCB40E5221E0CEBB0075EC91 /* mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCB40E5021E0CEBB0075EC91 /* mobilenet_params */; }; FCB40E5421E0CEF80075EC91 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCB40E5321E0CEF80075EC91 /* synset.txt */; }; + FCC15E13221E715400DC3CB2 /* paddle-mobile-metallib.metallib in Resources */ = {isa = PBXBuildFile; fileRef = FCC15E12221E715400DC3CB2 /* paddle-mobile-metallib.metallib */; }; FCD3873821E1C31F0052F3D0 /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */; }; FCD3873921E1C31F0052F3D0 /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; FCF2870921DFAEC7009A87DA /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF2870821DFAEC7009A87DA /* AppDelegate.swift */; }; @@ -49,6 +50,7 @@ FCB40E4F21E0CEBB0075EC91 /* mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_model; sourceTree = ""; }; FCB40E5021E0CEBB0075EC91 /* mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_params; sourceTree = ""; }; FCB40E5321E0CEF80075EC91 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = ""; }; + FCC15E12221E715400DC3CB2 /* paddle-mobile-metallib.metallib */ = {isa = PBXFileReference; lastKnownFileType = "archive.metal-library"; name = "paddle-mobile-metallib.metallib"; path = "../../../../Library/Developer/Xcode/DerivedData/paddle-mobile-hdsimtkoxoondndnjczkbkchcwyh/Build/Products/Release-iphoneos/paddle-mobile-metallib.metallib"; sourceTree = ""; }; FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; FCF2870521DFAEC7009A87DA /* MobileNetDemo.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = MobileNetDemo.app; sourceTree = BUILT_PRODUCTS_DIR; }; FCF2870821DFAEC7009A87DA /* AppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppDelegate.swift; sourceTree = ""; }; @@ -127,6 +129,7 @@ 
FCF286FC21DFAEC7009A87DA = { isa = PBXGroup; children = ( + FCC15E12221E715400DC3CB2 /* paddle-mobile-metallib.metallib */, FCD3873721E1C31F0052F3D0 /* paddle_mobile.framework */, FCF2870721DFAEC7009A87DA /* MobileNetDemo */, FCF2870621DFAEC7009A87DA /* Products */, @@ -225,6 +228,7 @@ FCB40E5121E0CEBB0075EC91 /* mobilenet_model in Resources */, FCB40DE921E0B9410075EC91 /* banana.jpeg in Resources */, FCF2871021DFAEC8009A87DA /* Assets.xcassets in Resources */, + FCC15E13221E715400DC3CB2 /* paddle-mobile-metallib.metallib in Resources */, FCB40E5421E0CEF80075EC91 /* synset.txt in Resources */, FCB40E5221E0CEBB0075EC91 /* mobilenet_params in Resources */, FCF2870E21DFAEC7009A87DA /* Main.storyboard in Resources */, diff --git a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift index 4152b9be890fe3101e3137f02686fb6359cb108d..9596c1a535c587897d40cae7c73a4d5b6b442a11 100644 --- a/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift +++ b/metal/MobileNetDemo/MobileNetDemo/AppDelegate.swift @@ -10,37 +10,37 @@ import UIKit @UIApplicationMain class AppDelegate: UIResponder, UIApplicationDelegate { - - var window: UIWindow? - - - func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool { - // Override point for customization after application launch. - return true - } - - func applicationWillResignActive(_ application: UIApplication) { - // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. - // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. - } - - func applicationDidEnterBackground(_ application: UIApplication) { - // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. - // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. - } - - func applicationWillEnterForeground(_ application: UIApplication) { - // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. - } - - func applicationDidBecomeActive(_ application: UIApplication) { - // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. - } - - func applicationWillTerminate(_ application: UIApplication) { - // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. - } - - + + var window: UIWindow? + + + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplication.LaunchOptionsKey: Any]?) -> Bool { + // Override point for customization after application launch. + return true + } + + func applicationWillResignActive(_ application: UIApplication) { + // Sent when the application is about to move from active to inactive state. 
This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. + // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. + } + + func applicationDidEnterBackground(_ application: UIApplication) { + // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. + // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. + } + + func applicationWillEnterForeground(_ application: UIApplication) { + // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. + } + + func applicationDidBecomeActive(_ application: UIApplication) { + // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface. + } + + func applicationWillTerminate(_ application: UIApplication) { + // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. + } + + } diff --git a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift index 9a50f3db8d7d1b57853d5ba6893e4af14879173c..c0814601bb3b3221bc1eac14c16fe0b10ef2a90e 100644 --- a/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift +++ b/metal/MobileNetDemo/MobileNetDemo/MobileNet.swift @@ -16,48 +16,52 @@ import Foundation import paddle_mobile public class MobileNet: Net{ - class MobilenetPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) - super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + class MobilenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + } } - } - - class PreWords { - var contents: [String] = [] - init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { - if let filePath = inBundle.path(forResource: fileName, ofType: type) { - let string = try! String.init(contentsOfFile: filePath) - contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ - String($0[$0.index($0.startIndex, offsetBy: 10)...]) + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! 
String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] } - }else{ - fatalError("no file call \(fileName)") - } } - subscript(index: Int) -> String { - return contents[index] + + let labels = PreWords.init(fileName: "synset") + + override public func resultStr(res: [ResultHolder]) -> String { + let firstRes = res[0] + let resPointer = firstRes.result + var s: [String] = [] + (0.. String { - let resPointer = res.result - var s: [String] = [] - (0.. outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void mobilenet_preprocess_half( @@ -28,11 +28,11 @@ kernel void mobilenet_preprocess_half( texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(123.68f, 116.78f, 103.94f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } diff --git a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift index 4e31282f0356bdd3ce4bc2b7ef69e7ad0bd5ef89..a0d69c5c0633b68adf82582e1cef6357137645a5 100644 --- a/metal/MobileNetDemo/MobileNetDemo/ViewController.swift +++ b/metal/MobileNetDemo/MobileNetDemo/ViewController.swift @@ -10,84 +10,84 @@ import UIKit import paddle_mobile class ViewController: UIViewController { - @IBOutlet weak var resultTextView: UITextView! - @IBOutlet weak var selectImageView: UIImageView! - @IBOutlet weak var elapsedTimeLabel: UILabel! - var net: MobileNet! - var runner: Runner! - var toPredictTexture: MTLTexture? - - override func viewDidLoad() { - super.viewDidLoad() - GlobalConfig.shared.computePrecision = .Float16 - net = MobileNet.init(device: MetalHelper.shared.device) - runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue) + @IBOutlet weak var resultTextView: UITextView! + @IBOutlet weak var selectImageView: UIImageView! + @IBOutlet weak var elapsedTimeLabel: UILabel! + var net: MobileNet! + var runner: Runner! + var toPredictTexture: MTLTexture? - if let selectImage = UIImage.init(named: "banana.jpeg") { - selectImageView.image = selectImage - runner.getTexture(image: selectImage.cgImage!) 
{[weak self] (texture) in - self?.toPredictTexture = texture - } + override func viewDidLoad() { + super.viewDidLoad() + GlobalConfig.shared.computePrecision = .Float16 + net = MobileNet.init(device: MetalHelper.shared.device) + runner = Runner.init(inNet: net, commandQueue: MetalHelper.shared.queue) + + if let selectImage = UIImage.init(named: "banana.jpeg") { + selectImageView.image = selectImage + runner.getTexture(image: selectImage.cgImage!) {[weak self] (texture) in + self?.toPredictTexture = texture + } + } + + } + + @IBAction func loadAct(_ sender: Any) { + if runner.load() { + let resutText = " load success ! " + print(resutText) + self.resultTextView.text = resutText + } else { + fatalError(" load error ") + } + } + + @IBAction func selectImageAct(_ sender: Any) { + let imagePicker = UIImagePickerController() + imagePicker.sourceType = .camera + imagePicker.delegate = self + self.present(imagePicker, animated: true, completion: nil) } - } - - @IBAction func loadAct(_ sender: Any) { - if runner.load() { - let resutText = " load success ! " - print(resutText) - self.resultTextView.text = resutText - } else { - fatalError(" load error ") + @IBAction func clearAct(_ sender: Any) { + runner.clear() } - } - - @IBAction func selectImageAct(_ sender: Any) { - let imagePicker = UIImagePickerController() - imagePicker.sourceType = .camera - imagePicker.delegate = self - self.present(imagePicker, animated: true, completion: nil) - } - - @IBAction func clearAct(_ sender: Any) { - runner.clear() - } - - @IBAction func predictAct(_ sender: Any) { - if let texture = toPredictTexture { - let beginDate = Date.init() - runner.predict(texture: texture) { [weak self] (success, resultHolder) in - if success, let inResultHolder = resultHolder { - let timeUse = Date.init().timeIntervalSince(beginDate) - DispatchQueue.main.async { - self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms" - self?.resultTextView.text = self?.net.resultStr(res: inResultHolder) - } - + @IBAction func predictAct(_ sender: Any) { + + if let texture = toPredictTexture { + let beginDate = Date.init() + runner.predict(texture: texture) { [weak self] (success, resultHolder) in + if success, let inResultHolder = resultHolder { + let timeUse = Date.init().timeIntervalSince(beginDate) + DispatchQueue.main.async { + self?.elapsedTimeLabel.text = "\(timeUse * 1000)ms" + self?.resultTextView.text = self?.net.resultStr(res: inResultHolder) + } + + } else { + print(" predict fail ") + } + } } else { - print(" predict fail ") + print(" toPredictTexture is nil ") } - } - } else { - print(" toPredictTexture is nil ") + } - } - } extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate { - func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { - picker.dismiss(animated: true){[weak self] in - guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else { - fatalError("no image") - } - sSelf.selectImageView.image = image - sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in - sSelf.toPredictTexture = texture - }) + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { + picker.dismiss(animated: true){[weak self] in + guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? 
UIImage else { + fatalError("no image") + } + sSelf.selectImageView.image = image + sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in + sSelf.toPredictTexture = texture + }) + } } - } } diff --git a/metal/Podfile b/metal/Podfile index f07622c920f286102e29e9a09bdee52cbcebf116..b3a6d8a9df3a8f5af9d3c6ab9c3169ff658a422e 100644 --- a/metal/Podfile +++ b/metal/Podfile @@ -27,3 +27,8 @@ target 'MobileNetDemo' do pod 'Protobuf', '~> 3.0.0' end + +target 'paddle-mobile-metallib' do + project 'paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj' +end + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj index 9e7bab8b8afa48656645d953049df8fb51cf5918..749c8b2a92daf71ebbbdf7b0b6fa8b25073e9280 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj @@ -33,8 +33,6 @@ FC5E03B221DCE8D90016C137 /* mingren_input_data in Resources */ = {isa = PBXBuildFile; fileRef = FC5E03B121DCE8D90016C137 /* mingren_input_data */; }; FC704C1921D2375300F98BAB /* super_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1721D2375300F98BAB /* super_params */; }; FC704C1A21D2375300F98BAB /* super_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1821D2375300F98BAB /* super_model */; }; - FC704C2221D237FC00F98BAB /* combined_mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */; }; - FC704C2321D237FC00F98BAB /* combined_mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */; }; FC704C2421D237FC00F98BAB /* yolo_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C2021D237FC00F98BAB /* yolo_params */; }; FC704C2521D237FC00F98BAB /* yolo_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C2121D237FC00F98BAB /* yolo_model */; }; FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; }; @@ -44,11 +42,18 @@ FC9797C321D608E000F2FD90 /* mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC9797C121D608DF00F2FD90 /* mobilenet_params */; }; FC9797C721D609FB00F2FD90 /* synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FC9797C621D609FB00F2FD90 /* synset.txt */; }; FC9797CF21D6506F00F2FD90 /* mingren.jpg in Resources */ = {isa = PBXBuildFile; fileRef = FC9797CE21D6506F00F2FD90 /* mingren.jpg */; }; + FCAFD84B2231614200496A36 /* yolo_16_param in Resources */ = {isa = PBXBuildFile; fileRef = FCAFD8492231614200496A36 /* yolo_16_param */; }; + FCAFD84C2231614200496A36 /* yolo_16_model in Resources */ = {isa = PBXBuildFile; fileRef = FCAFD84A2231614200496A36 /* yolo_16_model */; }; FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC542122EF5400D94F7E /* MetalHelper.swift */; }; + FCC15E15221E716500DC3CB2 /* paddle-mobile-metallib.metallib in Resources */ = {isa = PBXBuildFile; fileRef = FCC15E14221E716400DC3CB2 /* paddle-mobile-metallib.metallib */; }; FCCED60521D7646E00BE8D5F /* test_image_super in Resources */ = {isa = PBXBuildFile; fileRef = FCCED60421D7646E00BE8D5F /* test_image_super */; }; + FCE834AE2232A4AE0057BF43 /* combined_mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCE834AC2232A4AE0057BF43 /* combined_mobilenet_params */; }; + FCE834AF2232A4AE0057BF43 /* 
combined_mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCE834AD2232A4AE0057BF43 /* combined_mobilenet_model */; }; + FCE834B12232B6DC0057BF43 /* vision_synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCE834B02232B6DC0057BF43 /* vision_synset.txt */; }; FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; }; FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; }; FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */; }; + FCFADE34222F63CC0037DCE8 /* test_big.JPG in Resources */ = {isa = PBXBuildFile; fileRef = FCFADE33222F63CB0037DCE8 /* test_big.JPG */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -101,8 +106,6 @@ FC5E03B121DCE8D90016C137 /* mingren_input_data */ = {isa = PBXFileReference; lastKnownFileType = file; path = mingren_input_data; sourceTree = ""; }; FC704C1721D2375300F98BAB /* super_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = super_params; sourceTree = ""; }; FC704C1821D2375300F98BAB /* super_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = super_model; sourceTree = ""; }; - FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_params; sourceTree = ""; }; - FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_model; sourceTree = ""; }; FC704C2021D237FC00F98BAB /* yolo_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_params; sourceTree = ""; }; FC704C2121D237FC00F98BAB /* yolo_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_model; sourceTree = ""; }; FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = ""; }; @@ -112,10 +115,17 @@ FC9797C121D608DF00F2FD90 /* mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_params; sourceTree = ""; }; FC9797C621D609FB00F2FD90 /* synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = synset.txt; sourceTree = ""; }; FC9797CE21D6506F00F2FD90 /* mingren.jpg */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = mingren.jpg; sourceTree = ""; }; + FCAFD8492231614200496A36 /* yolo_16_param */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_16_param; sourceTree = ""; }; + FCAFD84A2231614200496A36 /* yolo_16_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_16_model; sourceTree = ""; }; FCBCCC542122EF5400D94F7E /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalHelper.swift; sourceTree = ""; }; + FCC15E14221E716400DC3CB2 /* paddle-mobile-metallib.metallib */ = {isa = PBXFileReference; lastKnownFileType = "archive.metal-library"; name = "paddle-mobile-metallib.metallib"; path = 
"../../../../Library/Developer/Xcode/DerivedData/paddle-mobile-hdsimtkoxoondndnjczkbkchcwyh/Build/Products/Release-iphoneos/paddle-mobile-metallib.metallib"; sourceTree = ""; }; FCCED60421D7646E00BE8D5F /* test_image_super */ = {isa = PBXFileReference; lastKnownFileType = file; path = test_image_super; sourceTree = ""; }; + FCE834AC2232A4AE0057BF43 /* combined_mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_params; sourceTree = ""; }; + FCE834AD2232A4AE0057BF43 /* combined_mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_model; sourceTree = ""; }; + FCE834B02232B6DC0057BF43 /* vision_synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = vision_synset.txt; sourceTree = ""; }; FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultiPredictViewController.swift; sourceTree = ""; }; + FCFADE33222F63CB0037DCE8 /* test_big.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = test_big.JPG; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -153,6 +163,7 @@ FC039B7520E11C550081E9F8 = { isa = PBXGroup; children = ( + FCC15E14221E716400DC3CB2 /* paddle-mobile-metallib.metallib */, FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */, FC039B8020E11C550081E9F8 /* paddle-mobile-demo */, FC039B7F20E11C550081E9F8 /* Products */, @@ -193,6 +204,7 @@ FC203FA821CBFDBA00B37166 /* images */ = { isa = PBXGroup; children = ( + FCFADE33222F63CB0037DCE8 /* test_big.JPG */, FC2BFCBF21DF279900C262B2 /* classify-img-output.png */, FC2BFCBD21DF15D900C262B2 /* 123.jpg */, FC2BFCBB21DF0A8600C262B2 /* 00001.jpg */, @@ -257,21 +269,13 @@ FC704C1B21D237FC00F98BAB /* vision_model */ = { isa = PBXGroup; children = ( - FC704C1C21D237FC00F98BAB /* mobilenet */, + FCE834AB2232A4AE0057BF43 /* vision_mobilenet */, + FCAFD8482231614200496A36 /* yolo_16 */, FC704C1F21D237FC00F98BAB /* yolo */, ); path = vision_model; sourceTree = ""; }; - FC704C1C21D237FC00F98BAB /* mobilenet */ = { - isa = PBXGroup; - children = ( - FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */, - FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */, - ); - path = mobilenet; - sourceTree = ""; - }; FC704C1F21D237FC00F98BAB /* yolo */ = { isa = PBXGroup; children = ( @@ -316,6 +320,25 @@ path = mobilenet; sourceTree = ""; }; + FCAFD8482231614200496A36 /* yolo_16 */ = { + isa = PBXGroup; + children = ( + FCAFD8492231614200496A36 /* yolo_16_param */, + FCAFD84A2231614200496A36 /* yolo_16_model */, + ); + path = yolo_16; + sourceTree = ""; + }; + FCE834AB2232A4AE0057BF43 /* vision_mobilenet */ = { + isa = PBXGroup; + children = ( + FCE834B02232B6DC0057BF43 /* vision_synset.txt */, + FCE834AC2232A4AE0057BF43 /* combined_mobilenet_params */, + FCE834AD2232A4AE0057BF43 /* combined_mobilenet_model */, + ); + path = vision_mobilenet; + sourceTree = ""; + }; /* End PBXGroup section */ /* Begin PBXNativeTarget section */ @@ -381,20 +404,25 @@ FCCED60521D7646E00BE8D5F /* test_image_super in Resources */, FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */, FC9797CF21D6506F00F2FD90 /* mingren.jpg in Resources */, - FC704C2221D237FC00F98BAB /* combined_mobilenet_params in 
Resources */, + FCAFD84B2231614200496A36 /* yolo_16_param in Resources */, + FCE834AF2232A4AE0057BF43 /* combined_mobilenet_model in Resources */, FC704C1921D2375300F98BAB /* super_params in Resources */, FC2BFCBE21DF15D900C262B2 /* 123.jpg in Resources */, FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */, FC9797C721D609FB00F2FD90 /* synset.txt in Resources */, + FCFADE34222F63CC0037DCE8 /* test_big.JPG in Resources */, FC5E03B221DCE8D90016C137 /* mingren_input_data in Resources */, FC704C1A21D2375300F98BAB /* super_model in Resources */, FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */, + FCE834B12232B6DC0057BF43 /* vision_synset.txt in Resources */, FC9797C221D608E000F2FD90 /* mobilenet_model in Resources */, + FCAFD84C2231614200496A36 /* yolo_16_model in Resources */, FC2BFCC021DF279900C262B2 /* classify-img-output.png in Resources */, FC203FB221CBFDBA00B37166 /* test.jpg in Resources */, - FC704C2321D237FC00F98BAB /* combined_mobilenet_model in Resources */, + FCC15E15221E716500DC3CB2 /* paddle-mobile-metallib.metallib in Resources */, FC9797C321D608E000F2FD90 /* mobilenet_params in Resources */, FC704C2421D237FC00F98BAB /* yolo_params in Resources */, + FCE834AE2232A4AE0057BF43 /* combined_mobilenet_params in Resources */, FC2BFCBC21DF0A8600C262B2 /* 00001.jpg in Resources */, FC9797BE21D6045B00F2FD90 /* banana.jpeg in Resources */, FC704C2521D237FC00F98BAB /* yolo_model in Resources */, diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift index 537fb06ed9e5b9100bea43b7acae9c014e0f4a78..557f5eef35c47e0a341223acfd8ec3ef8d77de31 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/AppDelegate.swift @@ -16,36 +16,36 @@ import UIKit @UIApplicationMain class AppDelegate: UIResponder, UIApplicationDelegate { - + var window: UIWindow? - + func application(_ application: UIApplication, didFinishLaunchingWithOptions launchOptions: [UIApplicationLaunchOptionsKey: Any]?) -> Bool { // Override point for customization after application launch. return true } - + func applicationWillResignActive(_ application: UIApplication) { // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state. // Use this method to pause ongoing tasks, disable timers, and invalidate graphics rendering callbacks. Games should use this method to pause the game. } - + func applicationDidEnterBackground(_ application: UIApplication) { // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later. // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits. } - + func applicationWillEnterForeground(_ application: UIApplication) { // Called as part of the transition from the background to the active state; here you can undo many of the changes made on entering the background. } - + func applicationDidBecomeActive(_ application: UIApplication) { // Restart any tasks that were paused (or not yet started) while the application was inactive. 
If the application was previously in the background, optionally refresh the user interface. } - + func applicationWillTerminate(_ application: UIApplication) { // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:. } - - + + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/Contents.json b/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/Contents.json new file mode 100644 index 0000000000000000000000000000000000000000..9b6282f9578ef476c0d88b40a7629dd1afd0004e --- /dev/null +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/Contents.json @@ -0,0 +1,21 @@ +{ + "images" : [ + { + "idiom" : "universal", + "filename" : "paddle-mobile.png", + "scale" : "1x" + }, + { + "idiom" : "universal", + "scale" : "2x" + }, + { + "idiom" : "universal", + "scale" : "3x" + } + ], + "info" : { + "version" : 1, + "author" : "xcode" + } +} \ No newline at end of file diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/paddle-mobile.png b/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/paddle-mobile.png new file mode 100644 index 0000000000000000000000000000000000000000..7cb32991117140a3016c24bc6d3b96f696facf82 Binary files /dev/null and b/metal/paddle-mobile-demo/paddle-mobile-demo/Assets.xcassets/paddle-mobile.imageset/paddle-mobile.png differ diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard index 4060bb7b3566fc79cc043e861e7917ebf1d91f65..585fc3417d3003686b03bbfcf594d0fde62f4f4a 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard @@ -155,7 +155,7 @@ - + @@ -246,10 +246,6 @@ - - - - + + + + + + @@ -295,20 +300,26 @@ + + + + + + - + - + diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift index 54b4371d64800f25cfb9a347ddd303074fb89a7d..8252258c978a2b74a298389bec240eb256fc9126 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MetalHelper.swift @@ -17,14 +17,15 @@ import MetalKit import Foundation import paddle_mobile -public class MetalHelper { - let device: MTLDevice - let queue: MTLCommandQueue - let textureLoader: MTKTextureLoader - static let shared: MetalHelper = MetalHelper.init() - private init(){ - device = MTLCreateSystemDefaultDevice()! - queue = device.makeCommandQueue()! - textureLoader = MTKTextureLoader.init(device: device) - } +@objc public class MetalHelper: NSObject { + @objc let device: MTLDevice + @objc let queue: MTLCommandQueue + @objc let textureLoader: MTKTextureLoader + @objc static let shared: MetalHelper = MetalHelper.init() + private override init(){ + device = MTLCreateSystemDefaultDevice()! + queue = device.makeCommandQueue()! 
+ textureLoader = MTKTextureLoader.init(device: device) + super.init() + } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift index 22fb5723ac9e6f358f2632467389f277603fc59d..8af436d7796e445dc60d138927d07d7187db6bf6 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift @@ -16,51 +16,51 @@ import UIKit import paddle_mobile class MultiPredictViewController: UIViewController { - var runner1: Runner! - var runner2: Runner! - override func viewDidLoad() { - super.viewDidLoad() - let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device) - let genet = Genet.init(device: MetalHelper.shared.device) - runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue) - let queue2 = MetalHelper.shared.device.makeCommandQueue() + var runner1: Runner! + var runner2: Runner! + override func viewDidLoad() { + super.viewDidLoad() + let mobileNet = MobileNet_ssd_hand.init(device: MetalHelper.shared.device) + let genet = Genet.init(device: MetalHelper.shared.device) + runner1 = Runner.init(inNet: mobileNet, commandQueue: MetalHelper.shared.queue) + let queue2 = MetalHelper.shared.device.makeCommandQueue() + + runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue) + } - runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue) - } - - @IBAction func predictAct(_ sender: Any) { - let success = self.runner2.load() -// DispatchQueue.global().async { - let image1 = UIImage.init(named: "hand.jpg") -// let success = self.runner2.load() -// if success { -// for i in 0..<10000 { -// print(i) -// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in -// print("result1: ") -//// print(res) -// }) -// } -// } else { -// print("load failed") -// } -// self.runner1.clear() -// } -// return -// DispatchQueue.global().async { -//// sleep(1) -// let image1 = UIImage.init(named: "banana.jpeg") -//// if success { -// for _ in 0..<10 { -// self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in -// print("result2: ") -// print(res) -// }) -// } -//// } else { -//// print("load failed") -//// } -//// self.runner2.clear() -// } - } + @IBAction func predictAct(_ sender: Any) { + let success = self.runner2.load() + // DispatchQueue.global().async { + let image1 = UIImage.init(named: "hand.jpg") + // let success = self.runner2.load() + // if success { + // for i in 0..<10000 { + // print(i) + // self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in + // print("result1: ") + //// print(res) + // }) + // } + // } else { + // print("load failed") + // } + // self.runner1.clear() + // } + // return + // DispatchQueue.global().async { + //// sleep(1) + // let image1 = UIImage.init(named: "banana.jpeg") + //// if success { + // for _ in 0..<10 { + // self.runner2.predict(cgImage: image1!.cgImage!, completion: { (success, res) in + // print("result2: ") + // print(res) + // }) + // } + //// } else { + //// print("load failed") + //// } + //// self.runner2.clear() + // } + } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm index fac8af25278e8aa2350669fb4b921049a512e241..ddfc5f770d578dde5f345bcb5776bb1504078456 100644 --- 
a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/CPUCompute.mm @@ -20,30 +20,30 @@ #import struct NMSParam { - - float *score_data; - - float *box_data; - - float *output; - - int output_size; - - std::vector score_dim; - - std::vector box_dim; - - float scoreThredshold; - - int nmsTopK; - - int keepTopK; - - float nmsEta; - - float nmsThreshold; - - int background_label; + + float *score_data; + + float *box_data; + + float *output; + + int output_size; + + std::vector score_dim; + + std::vector box_dim; + + float scoreThredshold; + + int nmsTopK; + + int keepTopK; + + float nmsEta; + + float nmsThreshold; + + int background_label; }; @@ -53,63 +53,63 @@ constexpr int kBBoxSize = 4; template bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2) { - return pair1.first > pair2.first; + return pair1.first > pair2.first; } template static inline void GetMaxScoreIndex( const std::vector& scores, const T threshold, int top_k, std::vector>* sorted_indices) { - for (size_t i = 0; i < scores.size(); ++i) { - if (scores[i] > threshold) { - sorted_indices->push_back(std::make_pair(scores[i], i)); + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); } - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices->begin(), sorted_indices->end(), - SortScorePairDescend); - // Keep top_k scores if needed. - if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { - sorted_indices->resize(top_k); - } } template static inline T BBoxArea(const T* box, const bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); } else { - // If coordinate values are not within range [0, 1]. - return (w + 1) * (h + 1); + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } } - } } template static inline T JaccardOverlap(const T* box1, const T* box2, const bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = inter_xmax - inter_xmin; - const T inter_h = inter_ymax - inter_ymin; - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = inter_xmax - inter_xmin; + const T inter_h = inter_ymax - inter_ymin; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } } template @@ -120,40 +120,40 @@ static inline void NMSFast( const T score_threshold, const T nms_threshold, const T eta, const int top_k, std::vector* selected_indices) { - // The total boxes for each instance. - int num_boxes = bbox_dim[0]; - // 4: [xmin ymin xmax ymax] - int box_size = bbox_dim[1]; - - std::vector scores_data(num_boxes); - std::copy_n(score_data, num_boxes, scores_data.begin()); - std::vector> sorted_indices; - GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); - - selected_indices->clear(); - T adaptive_threshold = nms_threshold; - - while (sorted_indices.size() != 0) { - const int idx = sorted_indices.front().second; - bool keep = true; - for (size_t k = 0; k < selected_indices->size(); ++k) { - if (keep) { - const int kept_idx = (*selected_indices)[k]; - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, true); - keep = overlap <= adaptive_threshold; - } else { - break; - } - } - if (keep) { - selected_indices->push_back(idx); - } - sorted_indices.erase(sorted_indices.begin()); - if (keep && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; + // The total boxes for each instance. 
+ int num_boxes = bbox_dim[0]; + // 4: [xmin ymin xmax ymax] + int box_size = bbox_dim[1]; + + std::vector scores_data(num_boxes); + std::copy_n(score_data, num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, true); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } } - } } template @@ -165,48 +165,48 @@ void MultiClassNMS(const T *boxes_data, const int& background_label, const int& nms_top_k, const int& keep_top_k, const T& nms_threshold, const T& nms_eta, const T& score_threshold) { - - int64_t class_num = score_dim[0]; - int64_t predict_dim = score_dim[1]; - int num_det = 0; - for (int c = 0; c < class_num; ++c) { - if (c == background_label) continue; - const T *score_data = scores_data + c * predict_dim; - /// [c] is key - NMSFast(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta, + int64_t class_num = score_dim[0]; + int64_t predict_dim = score_dim[1]; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (c == background_label) continue; + const T *score_data = scores_data + c * predict_dim; + + /// [c] is key + NMSFast(boxes_data, box_dim, score_data, score_threshold, nms_threshold, nms_eta, nms_top_k, &((*indices)[c])); - num_det += (*indices)[c].size(); - } - - *num_nmsed_out = num_det; - if (keep_top_k > -1 && num_det > keep_top_k) { - std::vector>> score_index_pairs; - for (const auto& it : *indices) { - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& label_indices = it.second; - for (size_t j = 0; j < label_indices.size(); ++j) { - int idx = label_indices[j]; - // PADDLE_ENFORCE_LT(idx, predict_dim); - score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx))); - } + num_det += (*indices)[c].size(); } - // Keep top k results per image. - std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), - SortScorePairDescend>); - score_index_pairs.resize(keep_top_k); - - // Store the new indices. - std::map> new_indices; - for (size_t j = 0; j < score_index_pairs.size(); ++j) { - int label = score_index_pairs[j].second.first; - int idx = score_index_pairs[j].second.second; - new_indices[label].push_back(idx); + + *num_nmsed_out = num_det; + if (keep_top_k > -1 && num_det > keep_top_k) { + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + // PADDLE_ENFORCE_LT(idx, predict_dim); + score_index_pairs.push_back(std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. 
+ std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; } - new_indices.swap(*indices); - *num_nmsed_out = keep_top_k; - } } template @@ -215,69 +215,69 @@ void MultiClassOutput(const T *scores_data, const T *bboxes_data, T *outputs_data, const std::map>& selected_indices) { - int predict_dim = score_dim[1]; - int count = 0; - for (const auto& it : selected_indices) { - /// one batch - int label = it.first; - const T* sdata = scores_data + label * predict_dim; - const std::vector& indices = it.second; - for (size_t j = 0; j < indices.size(); ++j) { - int idx = indices[j]; - const T* bdata = bboxes_data + idx * kBBoxSize; - outputs_data[count * kOutputDim] = label; // label - outputs_data[count * kOutputDim + 1] = sdata[idx]; // score - // xmin, ymin, xmax, ymax - std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T)); - count++; + int predict_dim = score_dim[1]; + int count = 0; + for (const auto& it : selected_indices) { + /// one batch + int label = it.first; + const T* sdata = scores_data + label * predict_dim; + const std::vector& indices = it.second; + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + const T* bdata = bboxes_data + idx * kBBoxSize; + outputs_data[count * kOutputDim] = label; // label + outputs_data[count * kOutputDim + 1] = sdata[idx]; // score + // xmin, ymin, xmax, ymax + std::memcpy(outputs_data + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + count++; + } } - } } void MultiClassNMSCompute(NMSParam *param) { - assert(param->score_dim[0] == 1); - assert(param->box_dim[0] == 1); - assert (param->score_dim.size() == 3); - assert(param->box_dim.size() == 3); - - float* outputs; - auto background_label = param->background_label; - auto nms_top_k = param->nmsTopK; - auto keep_top_k = param->keepTopK; - auto nms_threshold = param->nmsThreshold; - auto nms_eta = param->nmsEta; - auto score_threshold = param->scoreThredshold; - - std::vector score_dim_one_batch = {param->score_dim[1], param->score_dim[2]}; - std::vector box_dim_one_batch = {param->box_dim[1], param->box_dim[2]}; - - std::vector batch_starts = {0}; - - std::map> indices; - int num_nmsed_out = 0; - - MultiClassNMS(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out, - background_label, nms_top_k, keep_top_k, nms_threshold, - nms_eta, score_threshold); - batch_starts.push_back(batch_starts.back() + num_nmsed_out); - - int output_size = 0; - int num_kept = batch_starts.back(); - if (num_kept == 0) { - outputs = new float[1]; - outputs[0] = -1; - output_size = 1; - } else { - outputs = new float[num_kept * kOutputDim]; - int64_t s = batch_starts[0]; - int64_t e = batch_starts[1]; - if (e > s) { - MultiClassOutput(param->score_data, score_dim_one_batch, param->box_data, outputs, indices); + assert(param->score_dim[0] == 1); + assert(param->box_dim[0] == 1); + assert (param->score_dim.size() == 3); + assert(param->box_dim.size() == 3); + + float* outputs; + auto background_label = param->background_label; + auto nms_top_k = param->nmsTopK; + auto keep_top_k = param->keepTopK; + auto nms_threshold = param->nmsThreshold; + auto nms_eta = param->nmsEta; + auto score_threshold = param->scoreThredshold; + + std::vector score_dim_one_batch = {param->score_dim[1], param->score_dim[2]}; + std::vector 
box_dim_one_batch = {param->box_dim[1], param->box_dim[2]}; + + std::vector batch_starts = {0}; + + std::map> indices; + int num_nmsed_out = 0; + + MultiClassNMS(param->box_data, box_dim_one_batch, param->score_data, score_dim_one_batch, &indices, &num_nmsed_out, + background_label, nms_top_k, keep_top_k, nms_threshold, + nms_eta, score_threshold); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + + int output_size = 0; + int num_kept = batch_starts.back(); + if (num_kept == 0) { + outputs = new float[1]; + outputs[0] = -1; + output_size = 1; + } else { + outputs = new float[num_kept * kOutputDim]; + int64_t s = batch_starts[0]; + int64_t e = batch_starts[1]; + if (e > s) { + MultiClassOutput(param->score_data, score_dim_one_batch, param->box_data, outputs, indices); + } + output_size = num_kept * kOutputDim; } - output_size = num_kept * kOutputDim; - } - param->output = outputs; - param->output_size = output_size; + param->output = outputs; + param->output_size = output_size; } @implementation CPUResult @@ -286,31 +286,31 @@ void MultiClassNMSCompute(NMSParam *param) { @implementation NMSCompute -(CPUResult *)computeWithScore:(float *)score andBBoxs:(float *)bbox { - NMSParam param; - param.box_data = bbox; - param.score_data = score; - param.background_label = self.background_label; - param.scoreThredshold = self.scoreThredshold; - param.nmsTopK = self.nmsTopK; - param.keepTopK = self.keepTopK; - param.nmsEta = self.nmsEta; - param.nmsThreshold = self.nmsThreshold; - std::vector score_dim; - for (int i = 0; i < self.scoreDim.count; ++i) { - score_dim.push_back(self.scoreDim[i].intValue); - } - param.score_dim = score_dim; - - std::vector box_dim; - for (int i = 0; i < self.bboxDim.count; ++i) { - box_dim.push_back(self.bboxDim[i].intValue); - } - param.box_dim = box_dim; - MultiClassNMSCompute(¶m); - CPUResult *cr = [[CPUResult alloc] init]; - cr.output = param.output; - cr.outputSize = param.output_size; - return cr; + NMSParam param; + param.box_data = bbox; + param.score_data = score; + param.background_label = self.background_label; + param.scoreThredshold = self.scoreThredshold; + param.nmsTopK = self.nmsTopK; + param.keepTopK = self.keepTopK; + param.nmsEta = self.nmsEta; + param.nmsThreshold = self.nmsThreshold; + std::vector score_dim; + for (int i = 0; i < self.scoreDim.count; ++i) { + score_dim.push_back(self.scoreDim[i].intValue); + } + param.score_dim = score_dim; + + std::vector box_dim; + for (int i = 0; i < self.bboxDim.count; ++i) { + box_dim.push_back(self.bboxDim[i].intValue); + } + param.box_dim = box_dim; + MultiClassNMSCompute(¶m); + CPUResult *cr = [[CPUResult alloc] init]; + cr.output = param.output; + cr.outputSize = param.output_size; + return cr; } @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift index 04c61201f9b20ac244a8547f00ac19154b28430c..b248e53bac56ba2018b029406486a29bb52e224f 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/Genet.swift @@ -16,33 +16,37 @@ import Foundation import paddle_mobile public class Genet: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! 
"para null" - preprocessKernel = GenetPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 128, 128, 3]) - } - - @objc override public init(device: MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { - super.init(device: device, - paramPointer: paramPointer, - paramSize: paramSize, - modePointer: modePointer, - modelSize: modelSize) - preprocessKernel = GenetPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 128, 128, 3]) - } - - class GenetPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3) - super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + @objc public override init(device: MTLDevice) { + super.init(device: device) + modelPath = Bundle.main.path(forResource: "genet_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "genet_params", ofType: nil) ?! "para null" + preprocessKernel = GenetPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 128, 128, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } + + @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device: device, + inParamPointer: inParamPointer, + inParamSize: inParamSize, + inModelPointer: inModelPointer, + inModelSize: inModelSize) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + preprocessKernel = GenetPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 128, 128, 3]) + } + + class GenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3) + super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + } + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0].result[0]) ... " } - } - - override public func resultStr(res: ResultHolder) -> String { - return " \(res.result[0]) ... " - } - + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift index 02fcbefe0b265a3f5898bb23956014183c95fc5e..608cd3180b0dedabafecb72baf98bf289163de20 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNet.swift @@ -16,51 +16,53 @@ import Foundation import paddle_mobile public class MobileNet: Net{ - - class MobilenetPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) - super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) - } - } - - class PreWords { - var contents: [String] = [] - init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { - if let filePath = inBundle.path(forResource: fileName, ofType: type) { - let string = try! 
String.init(contentsOfFile: filePath) - contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ - String($0[$0.index($0.startIndex, offsetBy: 10)...]) + + class MobilenetPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) } - }else{ - fatalError("no file call \(fileName)") - } } - subscript(index: Int) -> String { - return contents[index] + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] + } } - } - - let labels = PreWords.init(fileName: "synset") - - override public func resultStr(res: ResultHolder) -> String { - let resPointer = res.result - var s: [String] = [] - (0.. String { + let resPointer = res[0].result + var s: [String] = [] + (0.. String { - return " \(res.result[0]) ... " - } - + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 0 + modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null" + inputDim = Dim.init(inDim: [1, 224, 224, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + let paddleMobileMetallib = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + metalLibPath = paddleMobileMetallib + useMPS = true + preprocessKernel = ScaleKernel.init(device: device, shape: Shape.init(inWidth: 224, inHeight: 224, inChannel: 3), metalLoadMode: .LoadMetalInCustomMetalLib, metalLibPath: paddleMobileMetallib) + + } + let labels = PreWords.init(fileName: "vision_synset") + + class PreWords { + var contents: [String] = [] + init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) { + if let filePath = inBundle.path(forResource: fileName, ofType: type) { + let string = try! String.init(contentsOfFile: filePath) + contents = string.components(separatedBy: CharacterSet.newlines).filter{$0.count > 10}.map{ + String($0[$0.index($0.startIndex, offsetBy: 10)...]) + } + }else{ + fatalError("no file call \(fileName)") + } + } + subscript(index: Int) -> String { + return contents[index] + } + } + + override public func resultStr(res: [ResultHolder]) -> String { + let firstRes = res[0] + let resPointer = firstRes.result + var s: [String] = [] + (0.. String { - return " \(res)" - } - - override public func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { - -// guard let interRes = paddleMobileRes.intermediateResults else { -// fatalError(" need have inter result ") -// } -// -// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture else { -// fatalError(" need score ") -// } -// -// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? 
Texture else { -// fatalError() -// } -// -// var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) -//// print("score: ") -//// print(scoreFormatArr.strideArray()) -//// -// var bboxArr = bbox.metalTexture.float32Array() -//// print("bbox: ") -//// print(bboxArr.strideArray()) -// -// let nmsCompute = NMSCompute.init() -// nmsCompute.scoreThredshold = 0.01 -// nmsCompute.nmsTopK = 400 -// nmsCompute.keepTopK = 200 -// nmsCompute.nmsEta = 1.0 -// nmsCompute.nmsThreshold = 0.45 -// nmsCompute.background_label = 0; -// -// nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])] -// -// nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])] -// guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else { -// fatalError( " result error " ) -// } -// -// let output: [Float32] = result.map { $0.floatValue } -// -// -// return output - fatalError() - } - - - - + + @objc override public init(device: MTLDevice,inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer inModePointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModePointer,inModelSize:inModelSize) + except = 2 + modelPath = "" + paramPath = "" + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 300, 300, 3]) + } + + class MobilenetssdPreProccess: CusomKernel { + init(device: MTLDevice) { + let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3) + super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + } + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0])" + } + + override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { + + // guard let interRes = paddleMobileRes.intermediateResults else { + // fatalError(" need have inter result ") + // } + // + // guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? Texture else { + // fatalError(" need score ") + // } + // + // guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? 
Texture else { + // fatalError() + // } + // + // var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3])) + //// print("score: ") + //// print(scoreFormatArr.strideArray()) + //// + // var bboxArr = bbox.metalTexture.float32Array() + //// print("bbox: ") + //// print(bboxArr.strideArray()) + // + // let nmsCompute = NMSCompute.init() + // nmsCompute.scoreThredshold = 0.01 + // nmsCompute.nmsTopK = 400 + // nmsCompute.keepTopK = 200 + // nmsCompute.nmsEta = 1.0 + // nmsCompute.nmsThreshold = 0.45 + // nmsCompute.background_label = 0; + // + // nmsCompute.scoreDim = [NSNumber.init(value: score.tensorDim[0]), NSNumber.init(value: score.tensorDim[1]), NSNumber.init(value: score.tensorDim[2])] + // + // nmsCompute.bboxDim = [NSNumber.init(value: bbox.tensorDim[0]), NSNumber.init(value: bbox.tensorDim[1]), NSNumber.init(value: bbox.tensorDim[2])] + // guard let result = nmsCompute.compute(withScore: &scoreFormatArr, andBBoxs: &bboxArr) else { + // fatalError( " result error " ) + // } + // + // let output: [Float32] = result.map { $0.floatValue } + // + // + // return output + fatalError() + } + + + + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift index 3080dfb469774aa2aa9bd0ed58b9c5d03c52ca5f..76feb0ecd07cd3b7d5405e32d674f695a629aa06 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobilenetSSD_AR.swift @@ -16,133 +16,137 @@ import Foundation import paddle_mobile public class MobileNet_ssd_AR: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - except = 2 - modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! "para null" - preprocessKernel = MobilenetssdPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 160, 160, 3]) - } - - @objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { - super.init(device:device,paramPointer:paramPointer,paramSize:paramSize,modePointer:modePointer,modelSize:modelSize) - except = 2 - preprocessKernel = MobilenetssdPreProccess.init(device: device) - inputDim = Dim.init(inDim: [1, 160, 160, 3]) - } - - class MobilenetssdPreProccess: CusomKernel { - init(device: MTLDevice) { - let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3) - super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, metalLoadModel: .LoadMetalInDefaultLib, metalLibPath: nil) + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 2 + modelPath = Bundle.main.path(forResource: "ar_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "ar_params", ofType: nil) ?! 
"para null" + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 160, 160, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") } - } - - override public func resultStr(res: ResultHolder) -> String { - return " \(res.result[0])" - } - - override public func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { - fatalError() -// guard let interRes = paddleMobileRes.intermediateResults else { -// fatalError(" need have inter result ") -// } -// -// guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else { -// fatalError(" need score ") -// } -// -// guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else { -// fatalError() -// } -// let startDate = Date.init() + @objc override public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device:device,inParamPointer:inParamPointer,inParamSize:inParamSize,inModelPointer:inModelPointer,inModelSize:inModelSize) + except = 2 + preprocessKernel = MobilenetssdPreProccess.init(device: device) + inputDim = Dim.init(inDim: [1, 160, 160, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } -// print("scoreFormatArr: ") -//print((0.. String { + return " \(res[0].result[0])" + } + + override public func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { + fatalError() + // guard let interRes = paddleMobileRes.intermediateResults else { + // fatalError(" need have inter result ") + // } + // + // guard let scores = interRes["Scores"], scores.count > 0, let score = scores[0] as? FetchHolder else { + // fatalError(" need score ") + // } + // + // guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? FetchHolder else { + // fatalError() + // } + + // let startDate = Date.init() + + // print("scoreFormatArr: ") + //print((0.. 
<float, access::read> inTexture [[texture(0)]], - texture2d<float, access::write> outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d<float, access::read> inTexture [[texture(0)]], + texture2d<float, access::write> outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -31,9 +31,9 @@ } kernel void mobilenet_preprocess_half( - texture2d<half, access::read> inTexture [[texture(0)]], - texture2d<half, access::write> outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d<half, access::read> inTexture [[texture(0)]], + texture2d<half, access::write> outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -45,9 +45,9 @@ kernel void mobilenet_preprocess_half( } kernel void mobilenet_ssd_preprocess( - texture2d<float, access::read> inTexture [[texture(0)]], - texture2d<float, access::write> outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d<float, access::read> inTexture [[texture(0)]], + texture2d<float, access::write> outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -59,9 +59,9 @@ kernel void mobilenet_ssd_preprocess( } kernel void mobilenet_ssd_preprocess_half( - texture2d<half, access::read> inTexture [[texture(0)]], - texture2d<half, access::write> outTexture [[texture(1)]], - uint2 gid [[thread_position_in_grid]]) + texture2d<half, access::read> inTexture [[texture(0)]], + texture2d<half, access::write> outTexture [[texture(1)]], + uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) { @@ -74,44 +74,44 @@ kernel void mobilenet_ssd_preprocess_half( kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void genet_preprocess_half(texture2d<half, access::read> inTexture [[texture(0)]], texture2d<half, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void mobilent_ar_preprocess(texture2d<float, access::read> inTexture [[texture(0)]], texture2d<float, access::write> outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); - const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height()) { + return; + } + const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f); + const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } kernel void mobilent_ar_preprocess_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); - const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; - outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift index 5017d3274da34459dfd26060e3c3598b76ba8d31..a13b6303ac3c8127761a5449e186d896cbf8925f 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift @@ -17,18 +17,22 @@ import Foundation import paddle_mobile public class YoloNet: Net { - @objc public override init(device: MTLDevice) { - super.init(device: device) - except = 0 - modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null" - inputDim = Dim.init(inDim: [1, 416, 416, 3]) -// metalLoadMode = .LoadMetalInCustomMetalLib -// metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! " can't be nil " - } - - override public func resultStr(res: ResultHolder) -> String { - return " \(res.result[0]) ... " - } - + @objc public override init(device: MTLDevice) { + super.init(device: device) + except = 0 + modelPath = Bundle.main.path(forResource: "yolo_16_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "yolo_16_param", ofType: nil) ?! "para null" + inputDim = Dim.init(inDim: [1, 416, 416, 3]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + useMPS = true + paramPrecision = .Float16 + preprocessKernel = ScaleKernel.init(device: device, shape: Shape.init(inWidth: 416, inHeight: 416, inChannel: 3), metalLoadMode: .LoadMetalInCustomMetalLib, metalLibPath: Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")) + + } + + override public func resultStr(res: [ResultHolder]) -> String { + return " \(res[0].result[0]) ... 
" + } + } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h index ffb3c0699e6154d72040d49d146e7b7ee21d59fb..82e6ad9467add0e0e561d416eddfa4298883508e 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.h @@ -14,6 +14,10 @@ #import + +/** + @b 从内存中加载模型 Demo, 可以在 main storyboard 中调整 Demo + */ @interface LoadPointerViewController : UIViewController @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m index 657f446a3acb83010b4ec5b5459f67c0aa063883..5bef9317b173d94c40008bf60c98c32a01f32dd2 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCDemo/LoadPointerViewController.m @@ -13,17 +13,20 @@ limitations under the License. */ #import "PaddleMobileGPU.h" +#import "paddle_mobile_demo-Swift.h" #import "LoadPointerViewController.h" -#import "paddle-mobile-demo-Bridging-Header.h" #import +#import @interface LoadPointerViewController () -@property (strong, nonatomic) id device; +@property (weak, nonatomic) IBOutlet UIImageView *imageView; + +@property (assign, nonatomic) BOOL loaded; @property (strong, nonatomic) id texture; -@property (strong, nonatomic) id queue; -@property (strong, nonatomic) PaddleMobileGPU *runner; + +@property (strong, nonatomic) PaddleMobileGPU *paddleMobile; @property (strong, nonatomic) ModelConfig *modelConfig; @end @@ -32,148 +35,82 @@ - (void)viewDidLoad { [super viewDidLoad]; - - - self.device = MTLCreateSystemDefaultDevice(); - - self.queue = [self.device newCommandQueue]; - - // Do any additional setup after loading the view. 
-// NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"genet_model" withExtension:nil].path; -// NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"genet_params" withExtension:nil].path; - - NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"ar_model" withExtension:nil].path; - NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"ar_params" withExtension:nil].path; - - long fileSize; - FILE *fp; - fp = fopen([modelPath UTF8String], "rb"); - fseek(fp, 0, SEEK_END); - fileSize = ftell(fp); - rewind(fp); - void *buffer = malloc(fileSize); - fread(buffer, 1, fileSize, fp); - fclose(fp); - - long paramfileSize; - FILE *parmaFilePointer; - parmaFilePointer = fopen([paramPath UTF8String], "rb"); - fseek(parmaFilePointer, 0, SEEK_END); - paramfileSize = ftell(parmaFilePointer); - rewind(parmaFilePointer); - void *parmaBuffer = malloc(paramfileSize); - fread(parmaBuffer, 1, paramfileSize, parmaFilePointer); - fclose(parmaFilePointer); - - _modelConfig = [[ModelConfig alloc] init]; -// _modelConfig.means = @[[NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0], [NSNumber numberWithFloat:128.0]]; -// _modelConfig.scale = 0.017; -// _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:128.], [NSNumber numberWithFloat:128.0],[NSNumber numberWithFloat:3.0]]; - _modelConfig.means = @[[NSNumber numberWithFloat:103.94], [NSNumber numberWithFloat:116.78], [NSNumber numberWithFloat:123.68]]; - _modelConfig.scale = 1; - _modelConfig.dims = @[[NSNumber numberWithFloat:1], [NSNumber numberWithFloat:160.], [NSNumber numberWithFloat:160.0],[NSNumber numberWithFloat:3.0]]; - _modelConfig.modelPointer = buffer; - _modelConfig.modelSize = (int)fileSize; - _modelConfig.paramPointer = parmaBuffer; - _modelConfig.paramSize = (int)paramfileSize; + + self.imageView.image = [UIImage imageNamed:@"banana.jpeg"]; + + NSString *modelPath = [[NSBundle mainBundle] URLForResource:@"super_model" withExtension:nil].path; + NSString *paramPath = [[NSBundle mainBundle] URLForResource:@"super_params" withExtension:nil].path; + + long fileSize; + FILE *fp; + fp = fopen([modelPath UTF8String], "rb"); + fseek(fp, 0, SEEK_END); + fileSize = ftell(fp); + rewind(fp); + void *buffer = malloc(fileSize); + fread(buffer, 1, fileSize, fp); + fclose(fp); + + long paramfileSize; + FILE *parmaFilePointer; + parmaFilePointer = fopen([paramPath UTF8String], "rb"); + fseek(parmaFilePointer, 0, SEEK_END); + paramfileSize = ftell(parmaFilePointer); + rewind(parmaFilePointer); + void *parmaBuffer = malloc(paramfileSize); + fread(parmaBuffer, 1, paramfileSize, parmaFilePointer); + fclose(parmaFilePointer); + + _modelConfig = [[ModelConfig alloc] init]; + _modelConfig.modelPointer = buffer; + _modelConfig.modelSize = (int)fileSize; + _modelConfig.paramPointer = parmaBuffer; + _modelConfig.paramSize = (int)paramfileSize; } - (IBAction)loaderButtonPressed:(id)sender { -// _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:GenetType modelConfig:_modelConfig]; - _runner = [[PaddleMobileGPU alloc] initWithCommandQueue:self.queue net:MobileNetSSDType modelConfig:_modelConfig]; - - [_runner load]; + self.paddleMobile = [[PaddleMobileGPU alloc] initWithCommandQueue:MetalHelper.shared.queue net:SuperResolutionNetType modelConfig:_modelConfig]; + _loaded = [self.paddleMobile load]; + NSLog(@" load 结果: %@", _loaded ? 
@"成功" : @"失败"); } - (IBAction)predictButtonPressed:(id)sender { - [self predict]; -} - -- (id) createTextureFromImage:(UIImage*) image device:(id) device -{ - image =[UIImage imageWithCGImage:[image CGImage] - scale:[image scale] - orientation: UIImageOrientationLeft]; - - NSLog(@"orientation and size and stuff %ld %f %f", (long)image.imageOrientation, image.size.width, image.size.height); - - CGImageRef imageRef = image.CGImage; - - size_t width = self.view.frame.size.width; - size_t height = self.view.frame.size.height; - - size_t bitsPerComponent = CGImageGetBitsPerComponent(imageRef); - size_t bitsPerPixel = CGImageGetBitsPerPixel(imageRef); - - CGColorSpaceRef colorSpace = CGImageGetColorSpace(imageRef); - - CGImageAlphaInfo alphaInfo = CGImageGetAlphaInfo(imageRef); - - // NSLog(@"%@ %u", colorSpace, alphaInfo); - - CGBitmapInfo bitmapInfo = kCGBitmapByteOrderDefault | alphaInfo; - // NSLog(@"bitmap info %u", bitmapInfo); - - - CGContextRef context = CGBitmapContextCreate( NULL, width, height, bitsPerComponent, (bitsPerPixel / 8) * width, colorSpace, bitmapInfo); - - if( !context ) - { - NSLog(@"Failed to load image, probably an unsupported texture type"); - return nil; - } - - CGContextDrawImage( context, CGRectMake( 0, 0, width, height ), image.CGImage); - - - MTLPixelFormat format = MTLPixelFormatRGBA8Unorm; - - MTLTextureDescriptor *texDesc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format - width:width - height:height - mipmapped:NO]; - id texture = [device newTextureWithDescriptor:texDesc]; - - [texture replaceRegion:MTLRegionMake2D(0, 0, width, height) - mipmapLevel:0 - withBytes:CGBitmapContextGetData(context) - bytesPerRow:4 * width]; - - return texture; + [self predict]; } - (void)predict { - _texture = [self createTextureFromImage:[UIImage imageNamed:@"hand.jpg"] device:self.device]; - NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970]; - NSInteger max = 428; - for (int i = 0;i < max; i ++) { - [_runner predict:_texture withCompletion:^(BOOL success , NSArray *result) { - if (success) { - if (i == max -1) { - double time = [[NSDate date] timeIntervalSince1970] - startTime; - time = (time/max)*1000; - NSLog(@"gap ==== %fms",time); - } -// for (int i = 0; i < result.count; i ++) { -// NSNumber *number = result[i]; -// NSLog(@"result %d = %f:",i, [number floatValue]); -// } - } - }]; - } + UIImage *image = self.imageView.image; + if (!image) { + NSLog(@" image is nil"); + return; + } + id texture = [MetalHelper.shared.textureLoader newTextureWithCGImage:image.CGImage options:nil error:nil]; + _texture = texture; + if (!_texture) { + NSLog(@" texture is nil"); + return; + } + + if (!self.loaded) { + NSLog(@" not load "); + return; + } + + NSTimeInterval startTime = [[NSDate date] timeIntervalSince1970]; + NSInteger max = 1; + for (int i = 0;i < max; i ++) { + [self.paddleMobile predict:_texture withCompletion:^(BOOL success , NSArray *result) { + if (success) { + if (i == max -1) { + double time = [[NSDate date] timeIntervalSince1970] - startTime; + time = (time/max)*1000; + NSLog(@"gap ==== %fms",time); + } + } + }]; + } } - -- (void)didReceiveMemoryWarning { - [super didReceiveMemoryWarning]; - // Dispose of any resources that can be recreated. -} - -/* -#pragma mark - Navigation - -// In a storyboard-based application, you will often want to do a little preparation before navigation -- (void)prepareForSegue:(UIStoryboardSegue *)segue sender:(id)sender { - // Get the new view controller using [segue destinationViewController]. 
- // Pass the selected object to the new view controller. +- (IBAction)clear:(id)sender { + [self.paddleMobile clear]; + self.loaded = NO; } -*/ @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h index 683ef3272dddf6f65a52730493bf7a3d12b77c4d..d45d7daaa19d4338238398981285567538bd1d0b 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.h @@ -16,8 +16,8 @@ #import typedef enum : NSUInteger { - SuperResolutionNetType, - MobileNetSSDType + SuperResolutionNetType, + MobileNetSSDType } NetType; @interface PaddleMobileGPUResult: NSObject @@ -26,6 +26,8 @@ typedef enum : NSUInteger { @property (assign, nonatomic) int outputSize; +@property (strong, nonatomic) NSArray *dim; + -(void)releaseOutput; @end @@ -88,13 +90,13 @@ typedef enum : NSUInteger { * texture: the texture converted from the image to be predicted * completion: callback invoked when prediction completes */ --(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray *))completion; +-(void)predict:(id<MTLTexture>)texture withCompletion:(void (^)(BOOL, NSArray<NSArray<NSNumber *> *> *))completion; /* * texture: the texture converted from the image to be predicted * completion: callback invoked when prediction completes */ --(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion; +-(void)predict:(id<MTLTexture>)texture withResultCompletion:(void (^)(BOOL, NSArray *))completion; /* * Clean up memory diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m index 7625788b534a0c4f4dedb4664efe8b47f6bb1eb3..881a6cb5059cd2e30bb78bca6be33beec20c29b2 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/PaddleMobileGPU.m @@ -30,65 +30,75 @@ @implementation PaddleMobileGPUResult - (void)setOutputResult:(ResultHolder *)resultHolder { - self.resultHolder = resultHolder; - self.output = resultHolder.result; - self.outputSize = resultHolder.capacity; + self.resultHolder = resultHolder; + self.output = resultHolder.result; + self.outputSize = resultHolder.capacity; } -(void)releaseOutput { - [self.resultHolder releasePointer]; + [self.resultHolder releasePointer]; } @end @interface PaddleMobileGPU () { - Runner *runner; + Runner *runner; } @end @implementation PaddleMobileGPU -(instancetype)initWithCommandQueue:(id<MTLCommandQueue>)queue net:(NetType)netType modelConfig:(ModelConfig *)config { - self = [super init]; - if (self) { - Net *net = nil; - if (netType == SuperResolutionNetType) { - net = [[SuperResolutionNet alloc] initWithDevice:queue.device]; - } else if (netType == MobileNetSSDType) { - net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device paramPointer:config.paramPointer paramSize:config.paramSize modePointer:config.modelPointer modelSize:config.modelSize]; + self = [super init]; + if (self) { + Net *net = nil; + if (netType == SuperResolutionNetType) { + net = [[SuperResolutionNet alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize]; + } else if (netType == MobileNetSSDType) { + net = [[MobileNet_ssd_AR alloc] initWithDevice:queue.device inParamPointer:config.paramPointer inParamSize:config.paramSize inModelPointer:config.modelPointer inModelSize:config.modelSize]; + } + runner = [[Runner alloc] initInNet:net commandQueue:queue]; } - runner = [[Runner
alloc] initInNet:net commandQueue:queue]; - } - return self; + return self; } -(BOOL)load { - return [runner load]; + return [runner load]; } --(void)predict:(id)texture withCompletion:(void (^)(BOOL, NSArray *))completion { - - [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) { - NSMutableArray *resultArray = [NSMutableArray arrayWithCapacity:result.capacity]; - for (int i = 0; i < result.capacity; ++i) { - [resultArray addObject:[NSNumber numberWithFloat:result.result[i]]]; - } - completion(success, resultArray); - [result releasePointer]; +-(void)predict:(id)texture withCompletion:(void (^)(BOOL, NSArray*> *))completion { - }]; + [runner predictWithTexture:texture completion:^(BOOL success, NSArray * _Nullable resultArr) { + NSMutableArray*> *ocResultArray = [NSMutableArray arrayWithCapacity:resultArr.count]; + for (int i = 0; i < resultArr.count; ++i) { + ResultHolder *resultHolder = resultArr[i]; + NSMutableArray *res = [NSMutableArray arrayWithCapacity:resultHolder.capacity]; + for (int j = 0; j < resultHolder.capacity; ++j) { + [res addObject:[NSNumber numberWithFloat:resultHolder.result[i]]]; + } + [ocResultArray addObject:res]; + [resultHolder releasePointer]; + } + completion(success, ocResultArray); + }]; } --(void)predict:(id)texture withResultCompletion:(void (^)(BOOL, PaddleMobileGPUResult *))completion { - [runner predictWithTexture:texture completion:^(BOOL success, ResultHolder * _Nullable result) { - PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init]; - [gpuResult setOutputResult:result]; - completion(success, gpuResult); - }]; +-(void)predict:(id)texture withResultCompletion:(void (^)(BOOL, NSArray *))completion { + [runner predictWithTexture:texture completion:^(BOOL success, NSArray * _Nullable resultArr) { + NSMutableArray *ocResultArr = [NSMutableArray arrayWithCapacity:resultArr.count]; + for (int i = 0; i < resultArr.count; ++i) { + ResultHolder *result = resultArr[i]; + PaddleMobileGPUResult *gpuResult = [[PaddleMobileGPUResult alloc] init]; + gpuResult.dim = result.dim; + [gpuResult setOutputResult:result]; + [ocResultArr addObject:gpuResult]; + } + completion(success, ocResultArr); + }]; } -(void)clear { - [runner clear]; + [runner clear]; } @end diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift index 284612a04df57ffae78d59827547d8d26826b019..50dd29095e19c3cbe4f25e18ecd2690cddea1027 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/OCInterface/SuperResolutionNet.swift @@ -16,45 +16,57 @@ import Foundation import paddle_mobile @objc public class SuperResolutionNet: Net{ - override public func resultStr(res: ResultHolder) -> String { - return "未实现" - } - - @objc override public init(device: MTLDevice) { - super.init(device: device) - except = 0 - modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null" - paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null" - preprocessKernel = nil - inputDim = Dim.init(inDim: [1, 224, 224, 1]) -// metalLoadMode = .LoadMetalInCustomMetalLib -// metalLibPath = Bundle.main.path(forResource: "PaddleMobileMetal", ofType: "metallib") ?! 
" can't be nil " - } - - override public func updateProgram(program: Program) { - // n h w c - for block in program.programDesc.blocks { - for varDesc in block.vars { - if !varDesc.persistable { - if varDesc.type == .LodTensor { - let varEle = program.scope.vars[varDesc.name] - if let texture = varEle as? Texture { - let newDim = Dim.init(inDim: [texture.dim[0], inputDim[1], inputDim[2], texture.tensorDim[1]]) - print(" var desc name " + varDesc.name + " new dim" + "\(newDim)") - - texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim) - texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) - - let output: FetchHolder = program.scope.output() as! FetchHolder - output.dim = newDim - output.capacity = newDim.numel() - output.paddedCapacity = newDim.numel() * 4 - output.initBuffer(device: device) + override public func resultStr(res: [ResultHolder]) -> String { + return "未实现" + } + + public override init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize: Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + super.init(device: device) + except = 0 + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + inputDim = Dim.init(inDim: [1, 224, 224, 3]) + self.paramPointer = inParamPointer + self.paramSize = inParamSize + self.modelPointer = inModelPointer + self.modelSize = inModelSize + } + + @objc override public init(device: MTLDevice) { + super.init(device: device) + except = 0 + modelPath = Bundle.main.path(forResource: "super_model", ofType: nil) ?! "model null" + paramPath = Bundle.main.path(forResource: "super_params", ofType: nil) ?! "para null" + preprocessKernel = nil + inputDim = Dim.init(inDim: [1, 224, 224, 1]) + metalLoadMode = .LoadMetalInCustomMetalLib + metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib") + } + + override public func updateProgram(program: Program) { + // n h w c + for block in program.programDesc.blocks { + for varDesc in block.vars { + if !varDesc.persistable { + if varDesc.type == .LodTensor { + let varEle = program.scope.vars[varDesc.name] + if let texture = varEle as? Texture { + let newDim = Dim.init(inDim: [texture.dim[0], inputDim[1], inputDim[2], texture.tensorDim[1]]) + print(" var desc name " + varDesc.name + " new dim" + "\(newDim)") + + texture.updateDims(inTensorDim: Dim.init(inDim: [texture.tensorDim[0], texture.tensorDim[1], inputDim[1], inputDim[2]]), inDim: newDim) + texture.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) + + let output: FetchHolder = program.scope.output() as! 
FetchHolder + output.dim = newDim + output.capacity = newDim.numel() + output.paddedCapacity = newDim.numel() * 4 + output.initBuffer(device: device) + } + } + } } - } } - } } - } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift index f9e841f9c2a3060e775726023b6d5cfc3eeb679d..0080aa80f69cdbca5b132cd3019f2d9bedac3397 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/FPSCounter.swift @@ -4,28 +4,28 @@ import Foundation import QuartzCore public class FPSCounter { - private(set) public var fps: Double = 0 - - var frames = 0 - var startTime: CFTimeInterval = 0 - - public func start() { - frames = 0 - startTime = CACurrentMediaTime() - } - - public func frameCompleted() { - frames += 1 - let now = CACurrentMediaTime() - let elapsed = now - startTime - if elapsed > 0.1 { - let current = Double(frames) / elapsed - let smoothing = 0.75 - fps = smoothing*fps + (1 - smoothing)*current - if elapsed > 1 { + private(set) public var fps: Double = 0 + + var frames = 0 + var startTime: CFTimeInterval = 0 + + public func start() { frames = 0 startTime = CACurrentMediaTime() - } } - } + + public func frameCompleted() { + frames += 1 + let now = CACurrentMediaTime() + let elapsed = now - startTime + if elapsed > 0.1 { + let current = Double(frames) / elapsed + let smoothing = 0.75 + fps = smoothing*fps + (1 - smoothing)*current + if elapsed > 1 { + frames = 0 + startTime = CACurrentMediaTime() + } + } + } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift index c235ed2f0391bdc97e9e182c0e9897814a0518fa..cb639544872439b8595789b3df30cdfe49eb5ef0 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift @@ -6,15 +6,15 @@ import AVFoundation @available(iOS 10.0, *) @objc public protocol VideoCaptureDelegate: NSObjectProtocol { - @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime) + @objc optional func videoCapture(_ capture: VideoCapture, didCaptureSampleBuffer sampleBuffer: CMSampleBuffer, timestamp: CMTime) @objc optional func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhoto previewImage: UIImage?) @objc optional func videoCapture(_ capture: VideoCapture, didCapturePhotoTexture texture: MTLTexture?) } /** - Simple interface to the iPhone's camera. -*/ + Simple interface to the iPhone's camera. + */ @available(iOS 10.0, *) public class VideoCapture: NSObject { public var previewLayer: AVCaptureVideoPreviewLayer? @@ -35,9 +35,9 @@ public class VideoCapture: NSObject { self.cameraPosition = position super.init() } - + public func setUp(sessionPreset: AVCaptureSession.Preset = .medium, - completion: @escaping (Bool) -> Void) { + completion: @escaping (Bool) -> Void) { queue.async { let success = self.setUpCamera(sessionPreset: sessionPreset) DispatchQueue.main.async { @@ -45,7 +45,7 @@ public class VideoCapture: NSObject { } } } - + func fontCamera() -> AVCaptureDevice? 
{ let deveices = AVCaptureDevice.DiscoverySession.init(deviceTypes: [.builtInWideAngleCamera], mediaType: AVMediaType.video, position: .front).devices return deveices.first @@ -62,7 +62,7 @@ public class VideoCapture: NSObject { captureSession.beginConfiguration() captureSession.sessionPreset = sessionPreset - + var oCaptureDevice: AVCaptureDevice? switch cameraPosition { case .back: @@ -79,56 +79,56 @@ public class VideoCapture: NSObject { print("Error: no video devices available") return false } - + guard let videoInput = try? AVCaptureDeviceInput(device: captureDevice) else { print("Error: could not create AVCaptureDeviceInput") return false } - + if captureSession.canAddInput(videoInput) { captureSession.addInput(videoInput) } - + let previewLayer = AVCaptureVideoPreviewLayer(session: captureSession) previewLayer.videoGravity = AVLayerVideoGravity.resizeAspect previewLayer.connection?.videoOrientation = self.videoOrientation self.previewLayer = previewLayer - + let settings: [String : Any] = [ - kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA) + kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA) ] - + videoOutput.videoSettings = settings videoOutput.alwaysDiscardsLateVideoFrames = true videoOutput.setSampleBufferDelegate(self, queue: queue) if captureSession.canAddOutput(videoOutput) { captureSession.addOutput(videoOutput) } - + // We want the buffers to be in portrait orientation otherwise they are // rotated by 90 degrees. Need to set this _after_ addOutput()! videoOutput.connection(with: AVMediaType.video)?.videoOrientation = self.videoOrientation - + if captureSession.canAddOutput(photoOutput) { captureSession.addOutput(photoOutput) } - + captureSession.commitConfiguration() return true } - + public func start() { if !captureSession.isRunning { captureSession.startRunning() } } - + public func stop() { if captureSession.isRunning { captureSession.stopRunning() } } - + /* Captures a single frame of the camera input. */ public func capturePhoto() { let settings = AVCapturePhotoSettings(format: [kCVPixelBufferPixelFormatTypeKey as String: NSNumber(value: kCVPixelFormatType_32BGRA)]) @@ -139,7 +139,7 @@ public class VideoCapture: NSObject { ] photoOutput.capturePhoto(with: settings, delegate: self) } - + func convertToMTLTexture(sampleBuffer: CMSampleBuffer?) -> MTLTexture? { if let textureCache = textureCache, let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { let width = CVPixelBufferGetWidth(imageBuffer) @@ -152,7 +152,7 @@ public class VideoCapture: NSObject { } return nil } - + func convertToUIImage(sampleBuffer: CMSampleBuffer?) -> UIImage? { if let sampleBuffer = sampleBuffer, let imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) { @@ -172,47 +172,47 @@ public class VideoCapture: NSObject { @available(iOS 10.0, *) extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate { - public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { - // Because lowering the capture device's FPS looks ugly in the preview, - // we capture at full speed but only call the delegate at its desired - // framerate. If `fps` is -1, we run at the full framerate. 
- let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) - let deltaTime = timestamp - lastTimestamp - if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) { - lastTimestamp = timestamp - self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp) - if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{ - let texture = convertToMTLTexture(sampleBuffer: sampleBuffer) - delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp) + public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + // Because lowering the capture device's FPS looks ugly in the preview, + // we capture at full speed but only call the delegate at its desired + // framerate. If `fps` is -1, we run at the full framerate. + let timestamp = CMSampleBufferGetPresentationTimeStamp(sampleBuffer) + let deltaTime = timestamp - lastTimestamp + if fps == -1 || deltaTime >= CMTimeMake(1, Int32(fps)) { + lastTimestamp = timestamp + self.delegate?.videoCapture?(self, didCaptureSampleBuffer: sampleBuffer, timestamp: timestamp) + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCaptureVideoTexture:timestamp:))) ?? false{ + let texture = convertToMTLTexture(sampleBuffer: sampleBuffer) + delegate?.videoCapture?(self, didCaptureVideoTexture: texture, timestamp: timestamp) + } } } - } - - public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { - print("dropped frame") - } + + public func captureOutput(_ output: AVCaptureOutput, didDrop sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) { + print("dropped frame") + } } @available(iOS 10.0, *) extension VideoCapture: AVCapturePhotoCaptureDelegate { - public func photoOutput(_ captureOutput: AVCapturePhotoOutput, - didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?, - previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?, - resolvedSettings: AVCaptureResolvedPhotoSettings, - bracketSettings: AVCaptureBracketedStillImageSettings?, - error: Error?) { - var imageTexture: MTLTexture? - var previewImage: UIImage? - if error == nil { - if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? false{ - imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer) - self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture) - } - - if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{ - previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer) - self.delegate?.videoCapture?(self, didCapturePhoto: previewImage) + public func photoOutput(_ captureOutput: AVCapturePhotoOutput, + didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?, + previewPhoto previewPhotoSampleBuffer: CMSampleBuffer?, + resolvedSettings: AVCaptureResolvedPhotoSettings, + bracketSettings: AVCaptureBracketedStillImageSettings?, + error: Error?) { + var imageTexture: MTLTexture? + var previewImage: UIImage? + if error == nil { + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhotoTexture:))) ?? 
false{ + imageTexture = convertToMTLTexture(sampleBuffer: photoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhotoTexture: imageTexture) + } + + if self.delegate?.responds(to: #selector(VideoCaptureDelegate.videoCapture(_:didCapturePhoto:))) ?? false{ + previewImage = convertToUIImage(sampleBuffer: previewPhotoSampleBuffer) + self.delegate?.videoCapture?(self, didCapturePhoto: previewImage) + } } } - } } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 5d40da577121ffb7c6481831efe7b11e36696d81..53c417e88d64926160a95829498425646c2ba1ad 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -19,264 +19,243 @@ import paddle_mobile import MetalPerformanceShaders class FileReader { - let file: UnsafeMutablePointer - let fileSize: Int - init(paramPath: String) throws { - guard let tmpFile = fopen(paramPath, "rb") else { - throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + let file: UnsafeMutablePointer + let fileSize: Int + init(paramPath: String) throws { + guard let tmpFile = fopen(paramPath, "rb") else { + throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + } + file = tmpFile + fseek(file, 0, SEEK_END) + fileSize = ftell(file) + guard fileSize > 0 else { + throw PaddleMobileError.loaderError(message: "param file size is too small") + } + rewind(file) + } + + func read() -> UnsafeMutablePointer { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size * fileSize) + fread(ptr, fileSize, 1, file) + return ptr } - file = tmpFile - fseek(file, 0, SEEK_END) - fileSize = ftell(file) - guard fileSize > 0 else { - throw PaddleMobileError.loaderError(message: "param file size is too small") + + deinit { + fclose(file) } - rewind(file) - } - - func read() -> UnsafeMutablePointer { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size * fileSize) - fread(ptr, fileSize, 1, file) - return ptr - } - - deinit { - fclose(file) - } } enum Platform { - case GPU + case GPU } let platformSupport: [(Platform, String)] = [(.GPU, "GPU")] enum SupportModel: String{ - case yolo = "yolo" - case mobilenet_combined = "mobilenet_combined" - case super_resolution = "superresoltion" - case mobilenet = "mobilenet" - - static func supportedModels() -> [SupportModel] { - return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet] - } + case yolo = "yolo" + case mobilenet_combined = "mobilenet_combined" + case super_resolution = "superresoltion" + case mobilenet = "mobilenet" + + static func supportedModels() -> [SupportModel] { + return [.super_resolution, .yolo, .mobilenet_combined, .mobilenet] + } } let netSupport: [SupportModel : Net] = [ - .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device), - .yolo : YoloNet.init(device: MetalHelper.shared.device), - .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device), - .mobilenet : MobileNet.init(device: MetalHelper.shared.device)] + .super_resolution : SuperResolutionNet.init(device: MetalHelper.shared.device), + .yolo : YoloNet.init(device: MetalHelper.shared.device), + .mobilenet_combined : MobileNetCombined.init(device: MetalHelper.shared.device), + .mobilenet : MobileNet.init(device: MetalHelper.shared.device)] class ViewController: UIViewController { - @IBOutlet weak var resultTextView: UITextView! 
- @IBOutlet weak var selectImageView: UIImageView! - @IBOutlet weak var elapsedTimeLabel: UILabel! - @IBOutlet weak var modelPickerView: UIPickerView! - @IBOutlet weak var threadPickerView: UIPickerView! - @IBOutlet weak var videoView: UIView! - // var videoCapture: VideoCapture! - - var selectImage: UIImage? - var inputPointer: UnsafeMutablePointer? - var modelType: SupportModel = SupportModel.supportedModels()[0] - var toPredictTexture: MTLTexture? - - var runner: Runner! - var platform: Platform = .GPU - var threadNum = 1 - - @IBAction func loadAct(_ sender: Any) { - runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue) - if platform == .GPU { -// let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil) -// let fileReader = try! FileReader.init(paramPath: filePath!) -// let pointer: UnsafeMutablePointer = fileReader.read() -// -// -// let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared) -// -// buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize) - - - if self.toPredictTexture == nil { - -// runner.getTexture(inBuffer: buffer!) { [weak self] (texture) in -// self?.toPredictTexture = texture -// } + @IBOutlet weak var resultTextView: UITextView! + @IBOutlet weak var selectImageView: UIImageView! + @IBOutlet weak var elapsedTimeLabel: UILabel! + @IBOutlet weak var modelPickerView: UIPickerView! + @IBOutlet weak var threadPickerView: UIPickerView! + @IBOutlet weak var videoView: UIView! + // var videoCapture: VideoCapture! + + var selectImage: UIImage? + var inputPointer: UnsafeMutablePointer? + var modelType: SupportModel = SupportModel.supportedModels()[0] + var toPredictTexture: MTLTexture? + + var runner: Runner! + var platform: Platform = .GPU + var threadNum = 1 + + @IBAction func loadAct(_ sender: Any) { + runner = Runner.init(inNet: netSupport[modelType]!, commandQueue: MetalHelper.shared.queue) + if platform == .GPU { + // let filePath = Bundle.main.path(forResource: "mingren_input_data", ofType: nil) + // let fileReader = try! FileReader.init(paramPath: filePath!) + // let pointer: UnsafeMutablePointer = fileReader.read() + // + // + // let buffer = MetalHelper.shared.device.makeBuffer(length: fileReader.fileSize, options: .storageModeShared) + // + // buffer?.contents().copyMemory(from: pointer, byteCount: fileReader.fileSize) + + + if self.toPredictTexture == nil { + let beforeDate = Date.init() + if modelType == .mobilenet_combined || modelType == .yolo { + self.toPredictTexture = try! MetalHelper.shared.textureLoader.newTexture(cgImage: selectImage!.cgImage!, options: nil) + } else { + runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in + let timeUse = Date.init().timeIntervalSince(beforeDate) + print("get texture time use: \(timeUse)") + self?.toPredictTexture = texture + } + } + } + } else { + fatalError( " unsupport " ) + } - runner.getTexture(image: selectImage!.cgImage!) { [weak self] (texture) in - self?.toPredictTexture = texture + if runner.load() { + print(" load success ! ") + } else { + print(" load error ! ") } - } - } else { - fatalError( " unsupport " ) } - if runner.load() { - print(" load success ! ") - } else { - print(" load error ! 
") + @IBAction func selectImageAct(_ sender: Any) { + let imagePicker = UIImagePickerController() + imagePicker.sourceType = .camera + imagePicker.delegate = self + self.present(imagePicker, animated: true, completion: nil) } - } - - @IBAction func selectImageAct(_ sender: Any) { - let imagePicker = UIImagePickerController() - imagePicker.sourceType = .camera - imagePicker.delegate = self - self.present(imagePicker, animated: true, completion: nil) - } - - @IBAction func clearAct(_ sender: Any) { - runner.clear() - } - - @IBAction func predictAct(_ sender: Any) { - let max = 1 - switch platform { - case .GPU: - guard let inTexture = toPredictTexture else { - resultTextView.text = "请选择图片 ! " - return - } - - let startDate = Date.init() - for i in 0.. Int { - if pickerView == modelPickerView { - return 1 - } else if pickerView == threadPickerView { - return 1 - } else { - fatalError() + func numberOfComponents(in pickerView: UIPickerView) -> Int { + if pickerView == modelPickerView { + return 1 + } else if pickerView == threadPickerView { + return 1 + } else { + fatalError() + } } - } - - func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { - if pickerView == modelPickerView { - return SupportModel.supportedModels().count - } else if pickerView == threadPickerView { - return platformSupport.count - } else { - fatalError() + + func pickerView(_ pickerView: UIPickerView, numberOfRowsInComponent component: Int) -> Int { + if pickerView == modelPickerView { + return SupportModel.supportedModels().count + } else if pickerView == threadPickerView { + return platformSupport.count + } else { + fatalError() + } } - } - - public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? { - if pickerView == modelPickerView { - return SupportModel.supportedModels()[row].rawValue - } else if pickerView == threadPickerView { - return platformSupport[row].1 - } else { - fatalError() + + public func pickerView(_ pickerView: UIPickerView, titleForRow row: Int, forComponent component: Int) -> String? { + if pickerView == modelPickerView { + return SupportModel.supportedModels()[row].rawValue + } else if pickerView == threadPickerView { + return platformSupport[row].1 + } else { + fatalError() + } } - } - - public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { - if pickerView == modelPickerView { - self.modelType = SupportModel.supportedModels()[row] - } else if pickerView == threadPickerView { - platform = platformSupport[row].0 - } else { - fatalError() + + public func pickerView(_ pickerView: UIPickerView, didSelectRow row: Int, inComponent component: Int) { + if pickerView == modelPickerView { + self.modelType = SupportModel.supportedModels()[row] + } else if pickerView == threadPickerView { + platform = platformSupport[row].0 + } else { + fatalError() + } } - } } extension ViewController: UIImagePickerControllerDelegate, UINavigationControllerDelegate { - func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { - picker.dismiss(animated: true){[weak self] in - guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? 
UIImage else{ - fatalError("no image") - } - sSelf.selectImage = image - sSelf.selectImageView.image = image - sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in - sSelf.toPredictTexture = texture - }) + func imagePickerController(_ picker: UIImagePickerController, didFinishPickingMediaWithInfo info: [String : Any]) { + picker.dismiss(animated: true){[weak self] in + guard let sSelf = self, let image = info["UIImagePickerControllerOriginalImage"] as? UIImage else{ + fatalError("no image") + } + sSelf.selectImage = image + sSelf.selectImageView.image = image + sSelf.runner.getTexture(image: image.cgImage!, getTexture: { (texture) in + sSelf.toPredictTexture = texture + }) + } } - } } var bool1 = false extension ViewController: VideoCaptureDelegate{ - func predictTexture(texture: MTLTexture){ - runner.scaleTexture(input: texture) { (scaledTexture) in - self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in - // print(resultHolder!.result![0]) - resultHolder?.releasePointer() - }) + func predictTexture(texture: MTLTexture){ + runner.scaleTexture(input: texture) { (scaledTexture) in + self.runner.predict(texture: scaledTexture, completion: { (success, resultHolder) in + resultHolder?.first?.releasePointer() + }) + } } - } - + } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj new file mode 100644 index 0000000000000000000000000000000000000000..007fd5e42982539dd9872cce4f2cead5727fde8d --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.pbxproj @@ -0,0 +1,393 @@ +// !$*UTF8*$! +{ + archiveVersion = 1; + classes = { + }; + objectVersion = 50; + objects = { + +/* Begin PBXBuildFile section */ + 5CCC0CF6759710BAFE999DB7 /* Pods_paddle_mobile_metallib.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */; }; + FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBC221E69DD00DC3CB2 /* ReluKernel.metal */; }; + FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBD221E69DD00DC3CB2 /* BoxCoder.metal */; }; + FCC15DE7221E69E100DC3CB2 /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBE221E69DD00DC3CB2 /* ConvAddBNReluKernel.metal */; }; + FCC15DE8221E69E100DC3CB2 /* Split.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DBF221E69DD00DC3CB2 /* Split.metal */; }; + FCC15DE9221E69E100DC3CB2 /* BilinearInterp.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC0221E69DD00DC3CB2 /* BilinearInterp.metal */; }; + FCC15DEA221E69E100DC3CB2 /* ElementwiseAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC1221E69DD00DC3CB2 /* ElementwiseAddPreluKernel.metal */; }; + FCC15DEB221E69E100DC3CB2 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC2221E69DD00DC3CB2 /* NMSFetchResultKernel.metal */; }; + FCC15DEC221E69E100DC3CB2 /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC3221E69DD00DC3CB2 /* Softmax.metal */; }; + FCC15DED221E69E100DC3CB2 /* PoolKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC4221E69DE00DC3CB2 /* PoolKernel.inc.metal */; }; + FCC15DEE221E69E100DC3CB2 /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC5221E69DE00DC3CB2 /* ConvTransposeKernel.metal */; 
}; + FCC15DEF221E69E100DC3CB2 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC6221E69DE00DC3CB2 /* Macro.metal */; }; + FCC15DF0221E69E100DC3CB2 /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC7221E69DE00DC3CB2 /* PreluKernel.metal */; }; + FCC15DF1221E69E100DC3CB2 /* BilinearInterp.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC8221E69DE00DC3CB2 /* BilinearInterp.inc.metal */; }; + FCC15DF2221E69E100DC3CB2 /* TransposeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DC9221E69DE00DC3CB2 /* TransposeKernel.inc.metal */; }; + FCC15DF3221E69E100DC3CB2 /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DCA221E69DE00DC3CB2 /* ConcatKernel.metal */; }; + FCC15DF4221E69E100DC3CB2 /* ResizeBilinear.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DCB221E69DE00DC3CB2 /* ResizeBilinear.metal */; }; + FCC15DF5221E69E100DC3CB2 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DCC221E69DE00DC3CB2 /* Common.metal */; }; + FCC15DF6221E69E100DC3CB2 /* PoolKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DCD221E69DE00DC3CB2 /* PoolKernel.metal */; }; + FCC15DF7221E69E100DC3CB2 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DCE221E69DE00DC3CB2 /* ReshapeKernel.inc.metal */; }; + FCC15DF8221E69E100DC3CB2 /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DCF221E69DE00DC3CB2 /* ConvBNReluKernel.metal */; }; + FCC15DF9221E69E100DC3CB2 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD0221E69DE00DC3CB2 /* Kernels.metal */; }; + FCC15DFA221E69E100DC3CB2 /* Shape.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD1221E69DF00DC3CB2 /* Shape.metal */; }; + FCC15DFB221E69E100DC3CB2 /* Softmax.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD2221E69DF00DC3CB2 /* Softmax.inc.metal */; }; + FCC15DFC221E69E100DC3CB2 /* ConvAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD3221E69DF00DC3CB2 /* ConvAddPreluKernel.metal */; }; + FCC15DFD221E69E100DC3CB2 /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */; }; + FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */; }; + FCC15DFF221E69E100DC3CB2 /* Scale.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD6221E69DF00DC3CB2 /* Scale.metal */; }; + FCC15E00221E69E100DC3CB2 /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */; }; + FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */; }; + FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */; }; + FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */; }; + FCC15E04221E69E100DC3CB2 /* ConvAddPrelu.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDB221E69E000DC3CB2 /* ConvAddPrelu.inc.metal */; }; + FCC15E05221E69E100DC3CB2 /* BatchNormKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDC221E69E000DC3CB2 /* BatchNormKernel.metal */; }; + FCC15E06221E69E100DC3CB2 /* BoxCoder.inc.metal in Sources */ = {isa = PBXBuildFile; 
fileRef = FCC15DDD221E69E000DC3CB2 /* BoxCoder.inc.metal */; }; + FCC15E07221E69E100DC3CB2 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDE221E69E000DC3CB2 /* FetchKernel.metal */; }; + FCC15E08221E69E100DC3CB2 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DDF221E69E000DC3CB2 /* Split.inc.metal */; }; + FCC15E09221E69E100DC3CB2 /* ConcatKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE0221E69E100DC3CB2 /* ConcatKernel.inc.metal */; }; + FCC15E0A221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */; }; + FCC15E0B221E69E100DC3CB2 /* FetchKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */; }; + FCC15E0C221E69E100DC3CB2 /* BufferToTexture.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */; }; + FCC15E0D221E69E100DC3CB2 /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */; }; +/* End PBXBuildFile section */ + +/* Begin PBXFileReference section */ + 33511F4FF7FE78679BE12DC0 /* Pods-paddle-mobile-metallib.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.release.xcconfig"; sourceTree = ""; }; + 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile_metallib.framework; sourceTree = BUILT_PRODUCTS_DIR; }; + C6D31B9F9533810DBCA6B28D /* Pods-paddle-mobile-metallib.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile-metallib.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile-metallib/Pods-paddle-mobile-metallib.debug.xcconfig"; sourceTree = ""; }; + FCC15D60221E66DE00DC3CB2 /* paddle-mobile-metallib.metallib */ = {isa = PBXFileReference; explicitFileType = "archive.metal-library"; includeInIndex = 0; path = "paddle-mobile-metallib.metallib"; sourceTree = BUILT_PRODUCTS_DIR; }; + FCC15DBC221E69DD00DC3CB2 /* ReluKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = ""; }; + FCC15DBD221E69DD00DC3CB2 /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = ""; }; + FCC15DBE221E69DD00DC3CB2 /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = ""; }; + FCC15DBF221E69DD00DC3CB2 /* Split.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.metal; sourceTree = ""; }; + FCC15DC0221E69DD00DC3CB2 /* BilinearInterp.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.metal; sourceTree = ""; }; + FCC15DC1221E69DD00DC3CB2 /* ElementwiseAddPreluKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.metal; sourceTree = ""; }; + FCC15DC2221E69DD00DC3CB2 /* 
NMSFetchResultKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = ""; }; + FCC15DC3221E69DD00DC3CB2 /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = ""; }; + FCC15DC4221E69DE00DC3CB2 /* PoolKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PoolKernel.inc.metal; sourceTree = ""; }; + FCC15DC5221E69DE00DC3CB2 /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvTransposeKernel.metal; sourceTree = ""; }; + FCC15DC6221E69DE00DC3CB2 /* Macro.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = ""; }; + FCC15DC7221E69DE00DC3CB2 /* PreluKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = ""; }; + FCC15DC8221E69DE00DC3CB2 /* BilinearInterp.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.inc.metal; sourceTree = ""; }; + FCC15DC9221E69DE00DC3CB2 /* TransposeKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.inc.metal; sourceTree = ""; }; + FCC15DCA221E69DE00DC3CB2 /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = ""; }; + FCC15DCB221E69DE00DC3CB2 /* ResizeBilinear.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ResizeBilinear.metal; sourceTree = ""; }; + FCC15DCC221E69DE00DC3CB2 /* Common.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = ""; }; + FCC15DCD221E69DE00DC3CB2 /* PoolKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PoolKernel.metal; sourceTree = ""; }; + FCC15DCE221E69DE00DC3CB2 /* ReshapeKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.inc.metal; sourceTree = ""; }; + FCC15DCF221E69DE00DC3CB2 /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = ""; }; + FCC15DD0221E69DE00DC3CB2 /* Kernels.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = ""; }; + FCC15DD1221E69DF00DC3CB2 /* Shape.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shape.metal; sourceTree = ""; }; + FCC15DD2221E69DF00DC3CB2 /* Softmax.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.inc.metal; sourceTree = ""; }; + FCC15DD3221E69DF00DC3CB2 /* ConvAddPreluKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvAddPreluKernel.metal; sourceTree = ""; }; + FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = ""; }; + FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = ""; }; + FCC15DD6221E69DF00DC3CB2 /* Scale.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Scale.metal; sourceTree = ""; }; + FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = ""; }; + FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = ""; }; + FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BatchNormRelu.metal; sourceTree = ""; }; + FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = ""; }; + FCC15DDB221E69E000DC3CB2 /* ConvAddPrelu.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvAddPrelu.inc.metal; sourceTree = ""; }; + FCC15DDC221E69E000DC3CB2 /* BatchNormKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BatchNormKernel.metal; sourceTree = ""; }; + FCC15DDD221E69E000DC3CB2 /* BoxCoder.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.inc.metal; sourceTree = ""; }; + FCC15DDE221E69E000DC3CB2 /* FetchKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = ""; }; + FCC15DDF221E69E000DC3CB2 /* Split.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.inc.metal; sourceTree = ""; }; + FCC15DE0221E69E100DC3CB2 /* ConcatKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.inc.metal; sourceTree = ""; }; + FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = ""; }; + FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = FetchKernel.inc.metal; sourceTree = ""; }; + FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BufferToTexture.metal; sourceTree = ""; }; + FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = ""; }; +/* End PBXFileReference section */ + +/* Begin PBXFrameworksBuildPhase section */ + 3262940821E130A79BEC3880 /* Frameworks */ = { + isa = PBXFrameworksBuildPhase; + buildActionMask = 2147483647; + files = ( + 5CCC0CF6759710BAFE999DB7 /* Pods_paddle_mobile_metallib.framework in Frameworks */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXFrameworksBuildPhase section */ + +/* Begin PBXGroup section */ + 2EC9E2608C2591494F8F23F0 /* Frameworks */ = { + isa = PBXGroup; + children = ( + 5D9D330A035906298947080B /* Pods_paddle_mobile_metallib.framework */, + ); + name = Frameworks; + sourceTree = ""; + }; + 755C26B34D5114CE1B98D3DC /* Pods */ = { + isa = PBXGroup; 
+ children = ( + C6D31B9F9533810DBCA6B28D /* Pods-paddle-mobile-metallib.debug.xcconfig */, + 33511F4FF7FE78679BE12DC0 /* Pods-paddle-mobile-metallib.release.xcconfig */, + ); + name = Pods; + sourceTree = ""; + }; + FCC15D59221E66DE00DC3CB2 = { + isa = PBXGroup; + children = ( + FCC15D62221E66DE00DC3CB2 /* paddle-mobile-metallib */, + FCC15D61221E66DE00DC3CB2 /* Products */, + 755C26B34D5114CE1B98D3DC /* Pods */, + 2EC9E2608C2591494F8F23F0 /* Frameworks */, + ); + sourceTree = ""; + }; + FCC15D61221E66DE00DC3CB2 /* Products */ = { + isa = PBXGroup; + children = ( + FCC15D60221E66DE00DC3CB2 /* paddle-mobile-metallib.metallib */, + ); + name = Products; + sourceTree = ""; + }; + FCC15D62221E66DE00DC3CB2 /* paddle-mobile-metallib */ = { + isa = PBXGroup; + children = ( + FCC15DDC221E69E000DC3CB2 /* BatchNormKernel.metal */, + FCC15DD9221E69E000DC3CB2 /* BatchNormRelu.metal */, + FCC15DC8221E69DE00DC3CB2 /* BilinearInterp.inc.metal */, + FCC15DC0221E69DD00DC3CB2 /* BilinearInterp.metal */, + FCC15DDD221E69E000DC3CB2 /* BoxCoder.inc.metal */, + FCC15DBD221E69DD00DC3CB2 /* BoxCoder.metal */, + FCC15DE3221E69E100DC3CB2 /* BufferToTexture.metal */, + FCC15DCC221E69DE00DC3CB2 /* Common.metal */, + FCC15DE0221E69E100DC3CB2 /* ConcatKernel.inc.metal */, + FCC15DCA221E69DE00DC3CB2 /* ConcatKernel.metal */, + FCC15DBE221E69DD00DC3CB2 /* ConvAddBNReluKernel.metal */, + FCC15DE4221E69E100DC3CB2 /* ConvAddMetal.metal */, + FCC15DDB221E69E000DC3CB2 /* ConvAddPrelu.inc.metal */, + FCC15DD3221E69DF00DC3CB2 /* ConvAddPreluKernel.metal */, + FCC15DCF221E69DE00DC3CB2 /* ConvBNReluKernel.metal */, + FCC15DD7221E69DF00DC3CB2 /* ConvKernel.metal */, + FCC15DC5221E69DE00DC3CB2 /* ConvTransposeKernel.metal */, + FCC15DD4221E69DF00DC3CB2 /* Elementwise.metal */, + FCC15DE1221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal */, + FCC15DC1221E69DD00DC3CB2 /* ElementwiseAddPreluKernel.metal */, + FCC15DE2221E69E100DC3CB2 /* FetchKernel.inc.metal */, + FCC15DDE221E69E000DC3CB2 /* FetchKernel.metal */, + FCC15DD0221E69DE00DC3CB2 /* Kernels.metal */, + FCC15DC6221E69DE00DC3CB2 /* Macro.metal */, + FCC15DC2221E69DD00DC3CB2 /* NMSFetchResultKernel.metal */, + FCC15DC4221E69DE00DC3CB2 /* PoolKernel.inc.metal */, + FCC15DCD221E69DE00DC3CB2 /* PoolKernel.metal */, + FCC15DC7221E69DE00DC3CB2 /* PreluKernel.metal */, + FCC15DD8221E69DF00DC3CB2 /* PriorBoxKernel.metal */, + FCC15DBC221E69DD00DC3CB2 /* ReluKernel.metal */, + FCC15DCE221E69DE00DC3CB2 /* ReshapeKernel.inc.metal */, + FCC15DD5221E69DF00DC3CB2 /* ReshapeKernel.metal */, + FCC15DCB221E69DE00DC3CB2 /* ResizeBilinear.metal */, + FCC15DD6221E69DF00DC3CB2 /* Scale.metal */, + FCC15DD1221E69DF00DC3CB2 /* Shape.metal */, + FCC15DD2221E69DF00DC3CB2 /* Softmax.inc.metal */, + FCC15DC3221E69DD00DC3CB2 /* Softmax.metal */, + FCC15DDF221E69E000DC3CB2 /* Split.inc.metal */, + FCC15DBF221E69DD00DC3CB2 /* Split.metal */, + FCC15DC9221E69DE00DC3CB2 /* TransposeKernel.inc.metal */, + FCC15DDA221E69E000DC3CB2 /* TransposeKernel.metal */, + ); + path = "paddle-mobile-metallib"; + sourceTree = ""; + }; +/* End PBXGroup section */ + +/* Begin PBXNativeTarget section */ + FCC15D5F221E66DE00DC3CB2 /* paddle-mobile-metallib */ = { + isa = PBXNativeTarget; + buildConfigurationList = FCC15D67221E66DE00DC3CB2 /* Build configuration list for PBXNativeTarget "paddle-mobile-metallib" */; + buildPhases = ( + DD854B7EC9A77A557887A67F /* [CP] Check Pods Manifest.lock */, + FCC15D5E221E66DE00DC3CB2 /* Sources */, + 3262940821E130A79BEC3880 /* Frameworks */, + ); + buildRules = ( + ); + 
dependencies = ( + ); + name = "paddle-mobile-metallib"; + productName = "paddle-mobile-metallib"; + productReference = FCC15D60221E66DE00DC3CB2 /* paddle-mobile-metallib.metallib */; + productType = "com.apple.product-type.metal-library"; + }; +/* End PBXNativeTarget section */ + +/* Begin PBXProject section */ + FCC15D5A221E66DE00DC3CB2 /* Project object */ = { + isa = PBXProject; + attributes = { + LastUpgradeCheck = 1010; + ORGANIZATIONNAME = Ray; + TargetAttributes = { + FCC15D5F221E66DE00DC3CB2 = { + CreatedOnToolsVersion = 10.1; + }; + }; + }; + buildConfigurationList = FCC15D5D221E66DE00DC3CB2 /* Build configuration list for PBXProject "paddle-mobile-metallib" */; + compatibilityVersion = "Xcode 9.3"; + developmentRegion = en; + hasScannedForEncodings = 0; + knownRegions = ( + en, + ); + mainGroup = FCC15D59221E66DE00DC3CB2; + productRefGroup = FCC15D61221E66DE00DC3CB2 /* Products */; + projectDirPath = ""; + projectRoot = ""; + targets = ( + FCC15D5F221E66DE00DC3CB2 /* paddle-mobile-metallib */, + ); + }; +/* End PBXProject section */ + +/* Begin PBXShellScriptBuildPhase section */ + DD854B7EC9A77A557887A67F /* [CP] Check Pods Manifest.lock */ = { + isa = PBXShellScriptBuildPhase; + buildActionMask = 2147483647; + files = ( + ); + inputFileListPaths = ( + ); + inputPaths = ( + "${PODS_PODFILE_DIR_PATH}/Podfile.lock", + "${PODS_ROOT}/Manifest.lock", + ); + name = "[CP] Check Pods Manifest.lock"; + outputFileListPaths = ( + ); + outputPaths = ( + "$(DERIVED_FILE_DIR)/Pods-paddle-mobile-metallib-checkManifestLockResult.txt", + ); + runOnlyForDeploymentPostprocessing = 0; + shellPath = /bin/sh; + shellScript = "diff \"${PODS_PODFILE_DIR_PATH}/Podfile.lock\" \"${PODS_ROOT}/Manifest.lock\" > /dev/null\nif [ $? != 0 ] ; then\n # print error to STDERR\n echo \"error: The sandbox is not in sync with the Podfile.lock. 
Run 'pod install' or update your CocoaPods installation.\" >&2\n exit 1\nfi\n# This output is used by Xcode 'outputs' to avoid re-running this script phase.\necho \"SUCCESS\" > \"${SCRIPT_OUTPUT_FILE_0}\"\n"; + showEnvVarsInLog = 0; + }; +/* End PBXShellScriptBuildPhase section */ + +/* Begin PBXSourcesBuildPhase section */ + FCC15D5E221E66DE00DC3CB2 /* Sources */ = { + isa = PBXSourcesBuildPhase; + buildActionMask = 2147483647; + files = ( + FCC15DF5221E69E100DC3CB2 /* Common.metal in Sources */, + FCC15DFF221E69E100DC3CB2 /* Scale.metal in Sources */, + FCC15DEC221E69E100DC3CB2 /* Softmax.metal in Sources */, + FCC15DE8221E69E100DC3CB2 /* Split.metal in Sources */, + FCC15DF2221E69E100DC3CB2 /* TransposeKernel.inc.metal in Sources */, + FCC15DE7221E69E100DC3CB2 /* ConvAddBNReluKernel.metal in Sources */, + FCC15E04221E69E100DC3CB2 /* ConvAddPrelu.inc.metal in Sources */, + FCC15DF9221E69E100DC3CB2 /* Kernels.metal in Sources */, + FCC15DF0221E69E100DC3CB2 /* PreluKernel.metal in Sources */, + FCC15DEB221E69E100DC3CB2 /* NMSFetchResultKernel.metal in Sources */, + FCC15DE9221E69E100DC3CB2 /* BilinearInterp.metal in Sources */, + FCC15DFA221E69E100DC3CB2 /* Shape.metal in Sources */, + FCC15E0C221E69E100DC3CB2 /* BufferToTexture.metal in Sources */, + FCC15E0A221E69E100DC3CB2 /* ElementwiseAddPreluKernel.inc.metal in Sources */, + FCC15DFB221E69E100DC3CB2 /* Softmax.inc.metal in Sources */, + FCC15E03221E69E100DC3CB2 /* TransposeKernel.metal in Sources */, + FCC15DFE221E69E100DC3CB2 /* ReshapeKernel.metal in Sources */, + FCC15E0D221E69E100DC3CB2 /* ConvAddMetal.metal in Sources */, + FCC15DF7221E69E100DC3CB2 /* ReshapeKernel.inc.metal in Sources */, + FCC15DE5221E69E100DC3CB2 /* ReluKernel.metal in Sources */, + FCC15DEF221E69E100DC3CB2 /* Macro.metal in Sources */, + FCC15E02221E69E100DC3CB2 /* BatchNormRelu.metal in Sources */, + FCC15E00221E69E100DC3CB2 /* ConvKernel.metal in Sources */, + FCC15E01221E69E100DC3CB2 /* PriorBoxKernel.metal in Sources */, + FCC15DEA221E69E100DC3CB2 /* ElementwiseAddPreluKernel.metal in Sources */, + FCC15DED221E69E100DC3CB2 /* PoolKernel.inc.metal in Sources */, + FCC15E07221E69E100DC3CB2 /* FetchKernel.metal in Sources */, + FCC15E0B221E69E100DC3CB2 /* FetchKernel.inc.metal in Sources */, + FCC15DEE221E69E100DC3CB2 /* ConvTransposeKernel.metal in Sources */, + FCC15DFC221E69E100DC3CB2 /* ConvAddPreluKernel.metal in Sources */, + FCC15E06221E69E100DC3CB2 /* BoxCoder.inc.metal in Sources */, + FCC15DF1221E69E100DC3CB2 /* BilinearInterp.inc.metal in Sources */, + FCC15E08221E69E100DC3CB2 /* Split.inc.metal in Sources */, + FCC15DF4221E69E100DC3CB2 /* ResizeBilinear.metal in Sources */, + FCC15E05221E69E100DC3CB2 /* BatchNormKernel.metal in Sources */, + FCC15DE6221E69E100DC3CB2 /* BoxCoder.metal in Sources */, + FCC15DF6221E69E100DC3CB2 /* PoolKernel.metal in Sources */, + FCC15E09221E69E100DC3CB2 /* ConcatKernel.inc.metal in Sources */, + FCC15DFD221E69E100DC3CB2 /* Elementwise.metal in Sources */, + FCC15DF8221E69E100DC3CB2 /* ConvBNReluKernel.metal in Sources */, + FCC15DF3221E69E100DC3CB2 /* ConcatKernel.metal in Sources */, + ); + runOnlyForDeploymentPostprocessing = 0; + }; +/* End PBXSourcesBuildPhase section */ + +/* Begin XCBuildConfiguration section */ + FCC15D65221E66DE00DC3CB2 /* Debug */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; + MTL_FAST_MATH = YES; + MTL_LANGUAGE_REVISION = Metal12; + SDKROOT = iphoneos; + }; + name = 
Debug; + }; + FCC15D66221E66DE00DC3CB2 /* Release */ = { + isa = XCBuildConfiguration; + buildSettings = { + ALWAYS_SEARCH_USER_PATHS = NO; + IPHONEOS_DEPLOYMENT_TARGET = 9.0; + MTL_ENABLE_DEBUG_INFO = NO; + MTL_FAST_MATH = YES; + MTL_LANGUAGE_REVISION = Metal12; + SDKROOT = iphoneos; + }; + name = Release; + }; + FCC15D68221E66DE00DC3CB2 /* Debug */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = C6D31B9F9533810DBCA6B28D /* Pods-paddle-mobile-metallib.debug.xcconfig */; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + MTL_ENABLE_DEBUG_INFO = NO; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Debug; + }; + FCC15D69221E66DE00DC3CB2 /* Release */ = { + isa = XCBuildConfiguration; + baseConfigurationReference = 33511F4FF7FE78679BE12DC0 /* Pods-paddle-mobile-metallib.release.xcconfig */; + buildSettings = { + CODE_SIGN_STYLE = Automatic; + MTL_ENABLE_DEBUG_INFO = NO; + PRODUCT_NAME = "$(TARGET_NAME)"; + }; + name = Release; + }; +/* End XCBuildConfiguration section */ + +/* Begin XCConfigurationList section */ + FCC15D5D221E66DE00DC3CB2 /* Build configuration list for PBXProject "paddle-mobile-metallib" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FCC15D65221E66DE00DC3CB2 /* Debug */, + FCC15D66221E66DE00DC3CB2 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; + FCC15D67221E66DE00DC3CB2 /* Build configuration list for PBXNativeTarget "paddle-mobile-metallib" */ = { + isa = XCConfigurationList; + buildConfigurations = ( + FCC15D68221E66DE00DC3CB2 /* Debug */, + FCC15D69221E66DE00DC3CB2 /* Release */, + ); + defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; + }; +/* End XCConfigurationList section */ + }; + rootObject = FCC15D5A221E66DE00DC3CB2 /* Project object */; +} diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/contents.xcworkspacedata b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/contents.xcworkspacedata new file mode 100644 index 0000000000000000000000000000000000000000..7fb68fedde2c18f3c2efbed9e635d8e26c5438f0 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/contents.xcworkspacedata @@ -0,0 +1,7 @@ + + + + + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist new file mode 100644 index 0000000000000000000000000000000000000000..18d981003d68d0546c4804ac2ff47dd97c6e7921 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist @@ -0,0 +1,8 @@ + + + + + IDEDidComputeMac32BitWarning + + + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/xcshareddata/xcschemes/paddle-mobile-metallib.xcscheme b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/xcshareddata/xcschemes/paddle-mobile-metallib.xcscheme new file mode 100644 index 0000000000000000000000000000000000000000..db675cafd8cf6352d6a1ec0925cbe5bcbfbeef90 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib.xcodeproj/xcshareddata/xcschemes/paddle-mobile-metallib.xcscheme @@ -0,0 +1,80 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormKernel.metal 
b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal similarity index 55% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal index 96333a07a9669ecb2b5bfe901d71be729e37b533..ab1dcfae6813ddef860158bc9fd638d26dfb4f8a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BatchNormKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormKernel.metal @@ -20,23 +20,23 @@ kernel void batchnorm(texture2d_array inTexture [[texture(0 const device float4 * nscale [[buffer(0)]], const device float4 * nbias [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - const float4 input = inTexture.read(gid.xy, gid.z); - float4 output = input * nscale[gid.z] + nbias[gid.z]; - outTexture.write(output, gid.xy, gid.z); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const float4 input = inTexture.read(gid.xy, gid.z); + float4 output = input * nscale[gid.z] + nbias[gid.z]; + outTexture.write(output, gid.xy, gid.z); } kernel void batchnorm_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 * newScale [[buffer(0)]], - const device half4 * newBias [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - const half4 input = inTexture.read(gid.xy, gid.z); - half4 output = input * newScale[gid.z] + newBias[gid.z]; - outTexture.write(output, gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + const device half4 * newScale [[buffer(0)]], + const device half4 * newBias [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + const half4 input = inTexture.read(gid.xy, gid.z); + half4 output = input * newScale[gid.z] + newBias[gid.z]; + outTexture.write(output, gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal new file mode 100644 index 0000000000000000000000000000000000000000..98ba10d8472086e85ddf62349d56a85c910dd312 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal @@ -0,0 +1,36 @@ +// +// BatchNormRelu.metal +// paddle-mobile +// + +#include +using namespace metal; + +struct MetalConvParam { + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; +}; + +kernel void batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *new_scale [[buffer(0)]], + const device float4 *new_biase [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + float4 input; + float4 output; + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + input = inTexture.sample(sample, gid.x, gid.y, gid.z); + output = fmax(input * new_scale[gid.z] + new_biase[gid.z], 0.0); + 
outTexture.write(output, gid.xy, gid.z); + +} diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..188c31019d98ae396bf8dcc605402529164e1dbe --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.inc.metal @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC(bilinear_interp, P)(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(1)]], + constant bilinear_interp_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + P w = gid.x * pm.ratio_w; + P h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + P w1lambda = w - w0, h1lambda = h - h0; + P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z); + VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z); + VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z); + VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal similarity index 95% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal index 394cf89db09d47b0d3c87ff124c21a93962c0972..6104abb01d459a7e258e4104f17bba9b4e23424c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BilinearInterp.metal @@ -16,8 +16,8 @@ using namespace metal; struct bilinear_interp_param { - float ratio_h; - float ratio_w; + float ratio_h; + float ratio_w; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..184ee2bb71189fa5e89e3d0c18901ea2b70e8d8e --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.inc.metal @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) +kernel void FUNC(boxcoder, P)(texture2d_array priorBox [[texture(0)]], + texture2d_array priorBoxVar [[texture(1)]], + texture2d_array targetBox [[texture(2)]], + texture2d_array output[[texture(3)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); + VECTOR(P, 4) t; + t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; + t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0]; + t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0]; + t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0]; + + P px = (p.x + p.z) / 2; + P py = (p.y + p.w) / 2; + P pw = p.z - p.x; + P ph = p.w - p.y; + + P tx = pv.x * t.x * pw + px; + P ty = pv.y * t.y * ph + py; + P tw = exp(pv.z * t.z) * pw; + P th = exp(pv.w * t.w) * ph; + + VECTOR(P, 4) r; + r.x = tx - tw / 2; + r.y = ty - th / 2; + r.z = tx + tw / 2; + r.w = ty + th / 2; + + output.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.metal similarity index 100% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/BoxCoder.metal diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BufferToTexture.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal similarity index 62% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BufferToTexture.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal index 3c07872616bb7c2f130d92247feeeeaa60ece21e..12450f574159cb7030c8e902cc3535d1dda1b864 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BufferToTexture.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BufferToTexture.metal @@ -13,24 +13,24 @@ kernel void buffer_to_texture_kernel( const device float *input [[buffer(0)]], texture2d outTexture [[texture(0)]], uint2 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - - float y = input[outTexture.get_width() * gid.y + gid.x]; - outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + + float y = input[outTexture.get_width() * gid.y + gid.x]; + outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid); } kernel void buffer_to_texture_kernel_half(const device float *input [[buffer(0)]], texture2d outTexture [[texture(0)]], uint2 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) { - return; - } - - float y = input[outTexture.get_width() * gid.y + 
gid.x]; - outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + + float y = input[outTexture.get_width() * gid.y + gid.x]; + outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal new file mode 100644 index 0000000000000000000000000000000000000000..099b8ca77cb10a81ffd8e2e026d1058c0954bd97 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + + +inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) { + abcd[0] = abcd[1] = abcd[2] = 0; + abcd[3] = xyzn[0] * 4 + xyzn[3]; +} +inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) { + abcd[0] = abcd[1] = 0; + abcd[2] = xyzn[1]; + abcd[3] = xyzn[0] * 4 + xyzn[3]; +} +inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) { + abcd[0] = 0; + abcd[3] = xyzn[0]; + abcd[2] = xyzn[1]; + abcd[1] = xyzn[2] * 4 + xyzn[3]; +} +inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) { + abcd[2] = xyzn[0]; + abcd[1] = xyzn[1]; + uint t = xyzn[2] * 4 + xyzn[3]; + abcd[0] = t / C; + abcd[3] = t % C; +} + +inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) { + xyzn[1] = xyzn[2] = 0; + xyzn[0] = abcd[3] / 4; + xyzn[1] = abcd[3] % 4; +} +inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) { + xyzn[2] = 0; + xyzn[1] = abcd[2]; + xyzn[0] = abcd[3] / 4; + xyzn[3] = abcd[3] % 4; +} +inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) { + xyzn[0] = abcd[3]; + xyzn[1] = abcd[2]; + xyzn[2] = abcd[1] / 4; + xyzn[3] = abcd[1] % 4; +} +inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) { + xyzn[0] = abcd[2]; + xyzn[1] = abcd[1]; + uint t = abcd[0] * C + abcd[3]; + xyzn[2] = t / 4; + xyzn[3] = t % 4; +} + +inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) { + abcd[2] = xyzn[0]; + abcd[1] = xyzn[1]; + uint t = xyzn[2] * 4 + xyzn[3]; + abcd[0] = t / C; + abcd[3] = t % C; +} + +inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) { + xyzn[0] = abcd[2]; + xyzn[1] = abcd[1]; + uint t = abcd[0] * C + abcd[3]; + xyzn[2] = t / 4; + xyzn[3] = t % 4; +} + +inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) { + int32_t r = abcd[0]; + r = r * dim[1] + abcd[1]; + r = r * dim[2] + abcd[2]; + r = r * dim[3] + abcd[3]; + return r; +} + +inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) { + abcd[3] = ind % dim[3]; ind /= dim[3]; + abcd[2] = ind % dim[2]; ind /= dim[2]; + abcd[1] = ind % dim[1]; ind /= dim[1]; + abcd[0] = ind; +} + +inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { + for (int i = 0; i < 4; i++) { + opos[i] = ipos[trans[i]]; + } +} + +inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { + for (int i = 0; i < 4; i++) { + opos[trans[i]] = ipos[i]; + } +} + + +struct MetalConvParam { + short 
offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; + ushort dilationX; + ushort dilationY; +}; + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..ff8bd3d7a39dd89186b1d3683fbf59d9f89e4ae5 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.inc.metal @@ -0,0 +1,318 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d +#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e + +#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p) +#define VECTOR(p, n) CONCAT2(p, n) +#define FUNC_R(f, r) CONCAT2_(f, r) + +#if V == VX +#define VV x +#elif V == VY +#define VV y +#elif V == VZ +#define VV z +#else +#define VV normal +#endif + +#if V == VNORMAL +//kernel void FUNC(concat, R, N, normal, P)(array, N> in [[texture(0)]], +// texture2d_array out_x [[texture(N)]], +// texture2d_array out [[texture(N+1)]], +// constant ConcatParam & pm [[buffer(0)]], +// uint3 gid [[thread_position_in_grid]]) { +//} +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif + texture2d_array inx [[texture(N)]], + texture2d_array out [[texture(N+1)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + ConcatParam cp = pm; + int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4]; + VECTOR(P, 4) r = inx.read(gid.xy, gid.z); + for (int i = 0; i < 4; i++) { + xyzn[3] = i; +#if R == 4 + xyzn2abcd_4(cp.odim[3], xyzn, abcd); +#else + FUNC_R(xyzn2abcd, R)(xyzn, abcd); +#endif + int k = abcd[cp.axis] - cp.offset; + if (k < 0) continue; + int j = 0; + for (; j < N; j++) { + if (k < cp.vdim[j]) { + break; + } + k -= cp.vdim[j]; + } + if (j == N) { + continue; + } + int ta = cp.odim[cp.axis]; + abcd[cp.axis] = k; + cp.odim[cp.axis] = cp.vdim[j]; +#if R == 4 + abcd2xyzn_4(cp.odim[3], abcd, oxyzn); +#else + FUNC_R(abcd2xyzn, R)(abcd, oxyzn); +#endif + cp.odim[cp.axis] = ta; + switch (j) { + case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#if N >= 3 + case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif +#if N >= 4 + case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif +#if N >= 5 + case 4: r[i] = in4.read(uint2(oxyzn[0], 
oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif +#if N >= 6 + case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; +#endif + } + } + out.write(r, gid.xy, gid.z); +} + +#endif // V == NORMAL + + + +#if V == VX +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif // N >= 4 +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif // N >= 5 +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif // N >= 6 + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int x = gid.x - pm.offset; + if (x < 0) return; + if (x < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + x -= pm.vdim[0]; + if (x < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#if N >= 3 + x -= pm.vdim[1]; + if (x < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + x -= pm.vdim[2]; + if (x < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 4 +#if N >= 5 + x -= pm.vdim[3]; + if (x < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 5 +#if N >= 6 + x -= pm.vdim[4]; + if (x < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 6 +} +#endif // V == VX + +#if V == VY +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif // N >= 4 +#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif // N >= 5 +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif // N >= 6 + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int y = gid.y - pm.offset; + if (y < 0) return; + if (y < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + y -= pm.vdim[0]; + if (y < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#if N >= 3 + y -= pm.vdim[1]; + if (y < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + y -= pm.vdim[2]; + if (y < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 4 +#if N >= 5 + y -= pm.vdim[3]; + if (y < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 5 +#if N >= 6 + y -= pm.vdim[4]; + if (y < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 6 +} +#endif // V == VY + +#if V == VZ +kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], +#if N >= 3 + texture2d_array in2 [[texture(2)]], +#endif // N >= 3 +#if N >= 4 + texture2d_array in3 [[texture(3)]], +#endif // N >= 4 
+#if N >= 5 + texture2d_array in4 [[texture(4)]], +#endif // N >= 5 +#if N >= 6 + texture2d_array in5 [[texture(5)]], +#endif // N >= 6 + texture2d_array out [[texture(N)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + int z = gid.z - pm.offset; + if (z < 0) return; + if (z < pm.vdim[0]) { + VECTOR(P, 4) r = in0.read(gid.xy, gid.z); + out.write(r, gid.xy, gid.z); + return; + } + z -= pm.vdim[0]; + if (z < pm.vdim[1]) { + VECTOR(P, 4) r = in1.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#if N >= 3 + z -= pm.vdim[1]; + if (z < pm.vdim[2]) { + VECTOR(P, 4) r = in2.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 3 +#if N >= 4 + z -= pm.vdim[2]; + if (z < pm.vdim[3]) { + VECTOR(P, 4) r = in3.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 4 +#if N >= 5 + z -= pm.vdim[3]; + if (z < pm.vdim[4]) { + VECTOR(P, 4) r = in4.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 5 +#if N >= 6 + z -= pm.vdim[4]; + if (z < pm.vdim[5]) { + VECTOR(P, 4) r = in5.read(gid.xy, z); + out.write(r, gid.xy, gid.z); + return; + } +#endif // N >= 6 +} +#endif // V == VZ + + +#undef VV +#endif // #ifdef P diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..8a0390e624151bac1573e6727de04df0e2bb27de --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal @@ -0,0 +1,171 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" + +using namespace metal; + +struct ConcatParam { + int32_t odim[4]; + int32_t axis; + int32_t offset; + int32_t trans[4]; + int32_t vdim[6]; +}; + +#define VNORMAL 1 +#define VX 2 +#define VY 3 +#define VZ 4 + +// >> fast mode +// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half} +// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half} +// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half} +// >> normal mode (loop mode) +// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x) +// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y) +// genet: (R=4, N=2, V=normal) + +// ssd-ar: (R=3, N=5, V=x) +#define V VX +#define R 3 +#define N 5 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + +// ssd-ar: (R=2, N=5, V=x) +#define V VX +#define R 2 +#define N 5 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + + +// ssd-ar: (R=3, N=2, V=y) +#define V VY +#define R 3 +#define N 2 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + +// ssd-ar: (R=4, N=3, V=z) +#define V VZ +#define R 4 +#define N 3 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + + +// ssd: (R=2, N=6, V=y) +#define V VY +#define R 2 +#define N 6 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + +// ssd: (R=3, N=6, V=y) +#define V VY +#define R 3 +#define N 6 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + +#define V VNORMAL +#define R 4 +#define N 2 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + + +#define V VY +#define R 2 +#define N 2 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + + +#define V VY +#define R 2 +#define N 5 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V + + + + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..f55386096f582b560abc4ea7c97945188afd1c9b --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddBNReluKernel.metal @@ -0,0 +1,310 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. */ + +#include <metal_stdlib> +#include "Common.metal" +using namespace metal; + + +kernel void conv_add_batch_norm_relu_1x1_half( + texture2d_array<half, access::sample> inTexture [[texture(0)]], + texture2d_array<half, access::write> outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_add_batch_norm_relu_3x3_half( + texture2d_array<half, access::sample> inTexture [[texture(0)]], + texture2d_array<half, access::write> outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+ input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_add_batch_norm_relu_3x3_half( + texture2d_array<half, access::sample> inTexture [[texture(0)]], + texture2d_array<half, access::write> outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + + + +/*---------------------------------------------*/ + + + +kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], + texture2d_array<float, access::write> outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]])
{ + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * 
input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal new file mode 100644 index 0000000000000000000000000000000000000000..7489f72896553ba5c412778d523adfc4d0ede279 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal @@ -0,0 +1,622 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
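+
+ ConvAddMetal.metal: fused convolution + bias ("conv_add") kernels in float and
+ half precision: 1x1, 3x3, 5x1 and 1x5 variants plus a 3x3 depthwise variant
+ and a test kernel. Zero padding comes from sampling with
+ address::clamp_to_zero, and the weights buffer is expected to be laid out as
+ [output slice][output lane 0..3][kernel position][input slice], which is what
+ the index weithTo + lane * kernelHXW * input_arr_size + j * input_arr_size + i
+ expands to.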
*/
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - convAdd
+kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = biase[gid.z];
+
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+        output.x += dot(input, weight_x);
+
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+        output.y += dot(input, weight_y);
+
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+        output.z += dot(input, weight_z);
+
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    // output = output + biase[gid.z];
+    outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+
+    const uint kernelHXW = 9;
+
+    uint input_arr_size = inTexture.get_array_size();
+
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = biase[gid.z];
+
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+
+    float4 input[9];
+
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i);
+
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 *
kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_5x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_y = param.dilationY; + float4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_add_1x5(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = biase[gid.z]; + + ushort dilation_x = 
param.dilationX; + float4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = biase[gid.z]; + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +#pragma mark - half + +kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase 
[[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(biase[gid.z]); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = float4(inTexture.sample(sample, float2(posInInput.x, posInInput.y), i)); + float4 weight_x = float4(weights[weithTo + 0 * kernelHXW * input_arr_size + i]); + output.x += dot(input, weight_x); + + float4 weight_y = float4(weights[weithTo + 1 * kernelHXW * input_arr_size + i]); + output.y += dot(input, weight_y); + + float4 weight_z = float4(weights[weithTo + 2 * kernelHXW * input_arr_size + i]); + output.z += dot(input, weight_z); + + float4 weight_w = float4(weights[weithTo + 3 * kernelHXW * input_arr_size + i]); + output.w += dot(input, weight_w); + } + // output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(biase[gid.z]); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = 
weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + // output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(biase[gid.z]); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + // output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + + +kernel void conv_add_5x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(biase[gid.z]); + + ushort dilation_y = param.dilationY; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, 
float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + // output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + + +kernel void conv_add_1x5_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(biase[gid.z]); + + ushort dilation_x = param.dilationX; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + // output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + + +kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + 
const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..e2b8834cc5314897f04f485a012b88fc29e5054d --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPrelu.inc.metal @@ -0,0 +1,447 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
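+
+ ConvAddPrelu.inc.metal: macro-templated include, compiled several times from
+ ConvAddPreluKernel.metal with P defined as float or half and PRELU_TYPE set to
+ prelu_channel, prelu_element or prelu_other. Each instantiation produces
+ conv_add kernels fused with a PReLU activation whose alpha buffer is,
+ respectively, one vector per output channel slice, one vector per output
+ element, or a single scalar shared by all elements.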
*/ + +#ifdef P + +#include "Macro.metal" + + +#pragma mark - convAdd +kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + VECTOR(P, 4) input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i); + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + + // output = output + float4(biase[gid.z]); + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + VECTOR(P, 4) input[9]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + // output = output + float4(biase[gid.z]); + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? 
output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z];; + + ushort dilation_y = param.dilationY; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? 
output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + + +kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device VECTOR(P, 4) *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + VECTOR(P, 4) output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + VECTOR(P, 4) input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device P *weights [[buffer(1)]], + const device VECTOR(P, 4) *biase [[buffer(2)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(3)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(3)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + VECTOR(P, 4) output = biase[gid.z]; + VECTOR(P, 4) inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + VECTOR(P, 4) input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? 
output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? output.w : (alpha_value * output.w); +#endif + outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); +} + +#endif + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..407b8385b7a7b822df9151905f167c930c8670a9 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddPreluKernel.metal @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +#define P float + +#define PRELU_CHANNEL prelu_channel +#define PRELU_TYPE prelu_channel +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL + +#define PRELU_ELEMENT prelu_element +#define PRELU_TYPE prelu_element +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT + +#define PRELU_OTHER prelu_other +#define PRELU_TYPE prelu_other +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER + +#undef P + +#define P half + +#define PRELU_CHANNEL prelu_channel +#define PRELU_TYPE prelu_channel +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_CHANNEL + +#define PRELU_ELEMENT prelu_element +#define PRELU_TYPE prelu_element +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_ELEMENT + +#define PRELU_OTHER prelu_other +#define PRELU_TYPE prelu_other +#include "ConvAddPrelu.inc.metal" +#undef PRELU_TYPE +#undef PRELU_OTHER + +#undef P + + + + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..6851f8aa98f49c405645e55d176ef921d2a1c0d2 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvBNReluKernel.metal @@ -0,0 +1,297 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
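+
+ ConvBNReluKernel.metal: fused convolution + batch norm + ReLU kernels (1x1,
+ 3x3 and depthwise 3x3, in float and half precision). There is no separate
+ bias input; new_scale and new_biase are expected to carry the folded
+ batch-norm parameters, and the epilogue computes
+   out = max(conv * new_scale + new_biase, 0).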
*/
+
+#include <metal_stdlib>
+#include "Common.metal"
+
+using namespace metal;
+
+#pragma mark - conv bn relu
+kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 1;
+
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(0.0);
+
+    float4 input;
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
+        output.x += dot(input, weight_x);
+
+        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
+        output.y += dot(input, weight_y);
+
+        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
+        output.z += dot(input, weight_z);
+
+        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+        output.w += dot(input, weight_w);
+    }
+    output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(0.0);
+
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; +
output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +#pragma mark - half +kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = 
inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + 
constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..c7b3f792d69033eb608e55ec747bb086e501040b --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvKernel.metal @@ -0,0 +1,280 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+// conv
+#pragma mark -- conv
+kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                     constant MetalConvParam &param [[buffer(0)]],
+                     const device float4 *weights [[buffer(1)]],
+                     uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint input_arr_size = inTexture.get_array_size();
+    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+
+    float4 output = float4(0.0);
+
+    float4 input[9];
+    for (uint i = 0; i < input_arr_size; ++i) {
+        input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
+        input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
+        input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
+        input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
+        input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+        input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
+        input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
+        input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
+        input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
+        for (int j = 0; j < 9; ++j) {
+            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.x += dot(input[j], weight_x);
+
+            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.y += dot(input[j], weight_y);
+
+            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.z += dot(input[j], weight_z);
+
+            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+            output.w += dot(input[j], weight_w);
+        }
+    }
+    outTexture.write(output, gid.xy, gid.z);
+}
+
+kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                               texture2d_array<float, access::write> outTexture [[texture(1)]],
+                               constant MetalConvParam &param [[buffer(0)]],
+                               const device float *weights [[buffer(1)]],
+                               uint3 gid [[thread_position_in_grid]]) {
+
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    uint output_slice = gid.z;
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    uint weithTo = gid.z * kernelHXW * 4;
+    float4 output = float4(0.0);
+    float4 inputs[9];
+    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
+    inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
+    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
+    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
+    inputs[4] = inTexture.sample(sample, float2(posInInput.x,
posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), 
i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr 
sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvTransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal similarity index 52% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvTransposeKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal index baf3f31157a472412bb08ccb3c803f5ec9e25d9c..a324fac188051552c349dda76da644b39ff00dbf 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvTransposeKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvTransposeKernel.metal @@ -16,17 +16,17 @@ using namespace metal; struct MetalConvTransposeParam{ - ushort kernelW; - ushort kernelH; - - ushort strideX; - ushort strideY; - - ushort paddingX; - ushort paddingY; - - ushort dilationX; - ushort dilationY; + ushort kernelW; + ushort kernelH; + + ushort strideX; + ushort strideY; + + ushort paddingX; + ushort paddingY; + + ushort dilationX; + ushort dilationY; }; kernel void conv_transpose2x2_stride2(texture2d_array inTexture [[texture(0)]], @@ -34,83 +34,83 @@ kernel void conv_transpose2x2_stride2(texture2d_array inT constant MetalConvTransposeParam ¶m [[buffer(0)]], const device float4 *weights [[buffer(1)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - int input_array_size = inTexture.get_array_size(); - int kernel_index_x = gid.x % 2; - int kernel_index_y = gid.y % 2; - int kernel_index = kernel_index_y * 2 + kernel_index_x; - int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); - int input_x = gid.x / 2; - int input_y = gid.y / 2; - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 output = float4(0.0); - for (int i = 0; i < input_array_size; ++i) { - - float4 input = inTexture.sample(sample, float2(input_x, input_y), i); - - float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; - float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; - float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; - float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; - - output.x += dot(input, kernel_slice0); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - output.y += dot(input, kernel_slice1); + int input_array_size = 
inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; - output.z += dot(input, kernel_slice2); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + float4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(input, kernel_slice0); + + output.y += dot(input, kernel_slice1); + + output.z += dot(input, kernel_slice2); + + output.w += dot(input, kernel_slice3); + } - output.w += dot(input, kernel_slice3); - } - - outTexture.write(output, gid.xy, gid.z); + outTexture.write(output, gid.xy, gid.z); } kernel void conv_transpose2x2_stride2_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvTransposeParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - int input_array_size = inTexture.get_array_size(); - int kernel_index_x = gid.x % 2; - int kernel_index_y = gid.y % 2; - int kernel_index = kernel_index_y * 2 + kernel_index_x; - int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); - int input_x = gid.x / 2; - int input_y = gid.y / 2; - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 output = float4(0.0); - for (int i = 0; i < input_array_size; ++i) { - - half4 input = inTexture.sample(sample, float2(input_x, input_y), i); - - half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; - half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; - half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; - half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; - - output.x += dot(float4(input), float4(kernel_slice0)); + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - output.y += dot(float4(input), float4(kernel_slice1)); + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; - output.z += dot(float4(input), float4(kernel_slice2)); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + half4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + half4 kernel_slice0 = weights[kernel_to + 
input_array_size * 4 * 0 + i]; + half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(float4(input), float4(kernel_slice0)); + + output.y += dot(float4(input), float4(kernel_slice1)); + + output.z += dot(float4(input), float4(kernel_slice2)); + + output.w += dot(float4(input), float4(kernel_slice3)); + } - output.w += dot(float4(input), float4(kernel_slice3)); - } - - outTexture.write(half4(output), gid.xy, gid.z); + outTexture.write(half4(output), gid.xy, gid.z); } //kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal new file mode 100644 index 0000000000000000000000000000000000000000..40cad28df130e2d826500cc840aaabf09d04e79b --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" + +using namespace metal; + +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; + +kernel void elementwise_add(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + float4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + float4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} + +kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || 
+ gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + half4 rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + half4 r = rx + ry; + outTexture.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..65566952efa5a30c8601e751cbfb0ac6ccf21464 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.inc.metal @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifdef P + +#include +#include "Macro.metal" + +using namespace metal; + +kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], +#ifdef PRELU_CHANNEL + const device VECTOR(P, 4) *alpha [[buffer(1)]], +#endif +#ifdef PRELU_ELEMENT + const device VECTOR(P, 4) *alpha [[buffer(1)]], +#endif +#ifdef PRELU_OTHER + const device P *alpha [[buffer(1)]], +#endif + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + VECTOR(P, 4) rx, ry; + + if (pm.fast == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(gid.xy, gid.z); + } else { + rx = inputX.read(gid.xy, gid.z); + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; + int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; + int32_t yshift = 4 - pm.ylen - pm.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); + ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + VECTOR(P, 4) output = rx + ry; + +#ifdef PRELU_CHANNEL + VECTOR(P, 4) alpha_value = alpha[gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_ELEMENT + int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); + VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; + output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); + output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); + output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); + output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); +#endif +#ifdef PRELU_OTHER + P alpha_value = alpha[0]; + output.x = output.x > 0 ? output.x : (alpha_value * output.x); + output.y = output.y > 0 ? output.y : (alpha_value * output.y); + output.z = output.z > 0 ? output.z : (alpha_value * output.z); + output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); +#endif + + outTexture.write(output, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal similarity index 92% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal index 8fd1a9fdab8c86fbc52f6dab9c448b7b0f27d403..cca11e80861723668eea05169c060cb7fcc455c2 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal @@ -17,13 +17,13 @@ using namespace metal; struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..114aa1566441163e52f03d201fb848d8185ea75a --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifdef P + +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT2(a, b) a ## b +#define FUNC(m, n, q) CONCAT3_(m, n, q) +#define FUNC_T(m, n) CONCAT2_(m, n) + +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC_T(fetch, P)(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + int input_height = inTexture.get_height(); + const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); + int output_to = 4 * input_width * input_height; + + output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; + + output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; + output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; + output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; +} + +kernel void FUNC(fetch, 1or2, P)(texture2d_array inTexture [[texture(0)]], + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = float4(input); +} + + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal similarity index 99% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal index 87d304302fe4dbf246ecfed2da1af8172ff717ac..df2de98648ef9ec5cfb8eaf5cc46887aadc04e98 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.metal @@ -31,7 +31,7 @@ using namespace metal; kernel void fetch_placeholder(texture2d_array inTexture [[texture(0)]], device float *output [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - + } kernel void fetch_placeholder_half(texture2d_array inTexture [[texture(0)]], diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Kernels.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal similarity index 60% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Kernels.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal index 368509f001aca6361b81b9b7839cf24b2efc5c12..06bf42697efa18b7100711301ea492447d3c14ce 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Kernels.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Kernels.metal @@ -23,47 +23,47 @@ kernel void place_holder(texture2d inTexture [[texture(0)]], } struct OutputDim { - ushort width; - ushort height; - ushort strideX; - ushort strideY; + ushort width; + ushort height; + ushort strideX; + ushort strideY; }; kernel void resize(texture2d inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant OutputDim ¶ms [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= 
outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint2 pos = gid.xy * uint2(params.strideX, params.strideY); - const half4 input = inTexture.read(pos); - outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z); + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint2 pos = gid.xy * uint2(params.strideX, params.strideY); + const half4 input = inTexture.read(pos); + outTexture.write(half4(input.x, input.y, input.z, input.w), gid.xy, gid.z); } kernel void texture2d_to_2d_array(texture2d inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height()){ - return; - } - const float4 input = inTexture.read(gid.xy); - outTexture.write(input, gid.xy, 0); + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const float4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); } kernel void texture2d_to_2d_array_half(texture2d inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height()){ - return; - } - const half4 input = inTexture.read(gid.xy); - outTexture.write(input, gid.xy, 0); + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height()){ + return; + } + const half4 input = inTexture.read(gid.xy); + outTexture.write(input, gid.xy, 0); } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Macro.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Macro.metal similarity index 100% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Macro.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/Macro.metal diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..e32c98cc29f964be4089699bbb035f059f32d0dd --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/NMSFetchResultKernel.metal @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +kernel void nms_fetch_result(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const float4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input.x; + +} + + +kernel void nms_fetch_result_half(texture2d_array inTexture [[texture(0)]], + device float *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + const half4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input.x; +} + +kernel void nms_fetch_bbox(texture2d_array inTexture [[texture(0)]], + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + // int input_height = inTexture.get_height(); + const float4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = input; +} + +kernel void nms_fetch_bbox_half(texture2d_array inTexture [[texture(0)]], + device float4 *output [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= inTexture.get_width() || + gid.y >= inTexture.get_height() || + gid.z >= inTexture.get_array_size()) { + return; + } + + int input_width = inTexture.get_width(); + // int input_height = inTexture.get_height(); + const half4 input = inTexture.read(gid.xy, gid.z); + output[gid.y * input_width + gid.x] = float4(input); +} + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..05146b8d14ea9f637ab7fc381f9911c1ad129ad2 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.inc.metal @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifdef P + +kernel void FUNC2_(pool, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + VECTOR(P, 4) r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= (xmax - xmin) * (ymax - ymin); + } + outTexture.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal similarity index 87% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal index e76b4ac74200ff9e61c888166f46deda7b071f2c..30111b7bcb24e6c5eceecfbbcd65430404333a1c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PoolKernel.metal @@ -18,13 +18,13 @@ using namespace metal; struct PoolParam { - int ksizeX; - int ksizeY; - int strideX; - int strideY; - int paddingX; - int paddingY; - int poolType; + int ksizeX; + int ksizeY; + int strideX; + int strideY; + int paddingX; + int paddingY; + int poolType; }; #define P half diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..6279821436804d8d3459899b986f01d326e35df0 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PreluKernel.metal @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +kernel void prelu_channel(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + float4 alpha_value = alpha[gid.z]; + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_element(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + float4 alpha_value = alpha[alpha_to + gid.z]; + + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_other(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device float *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + float alpha_value = alpha[0]; + float4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half4 alpha_value = alpha[gid.z]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? 
input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + half4 alpha_value = alpha[alpha_to + gid.z]; + + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half alpha_value = alpha[0]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); +} + + diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..c7f97043bfe8dc614080f510098d7b3e10f73c9a --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/PriorBoxKernel.metal @@ -0,0 +1,367 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +using namespace metal; + +struct PriorBoxMetalParam { + float offset; + float stepWidth; + float stepHeight; + float minSize; + float maxSize; + float imageWidth; + float imageHeight; + + bool clip; + + uint numPriors; + uint aspecRatiosSize; + uint minSizeSize; + uint maxSizeSize; +}; + +kernel void prior_box(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device float *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + float ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(max_box, gid.xy, gid.z); + } + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(variances_output, gid.xy, gid.z); + } +} + + +kernel void prior_box_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device half *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + half ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } 
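+    // Slices with gid.z >= aspecRatiosSize carry the max-size prior: a square box whose
+    // side is sqrt(minSize * maxSize), written only when maxSizeSize > 0 (checked below).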
else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(max_box), gid.xy, gid.z); + } + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); + } +} + + + +kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device float *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + + + if (gid.z == 0) { + box_width = box_height = param.minSize / 2; + + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } + + if (gid.z == 1 && param.maxSizeSize > 0) { + + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(res, gid.xy, gid.z); + } + + int aspect_to = 0; + if (param.maxSizeSize > 0) { + aspect_to = gid.z - 2; + } else { + aspect_to = gid.z - 1; + } + + + + + if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) { + + int skip = 0; + for (int i = 0; i < aspect_to + 1; ++i) { + if (fabs(aspect_ratios[i] - 1.) 
< 1e-6) { + skip += 1; + } + } + aspect_to += skip; + + float ar = aspect_ratios[aspect_to]; + + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(res, gid.xy, gid.z); + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(variances_output, gid.xy, gid.z); + } +} + + +kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + const device half *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + + + if (gid.z == 0) { + box_width = box_height = param.minSize / 2; + + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + if (gid.z == 1 && param.maxSizeSize > 0) { + + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + int aspect_to = 0; + if (param.maxSizeSize > 0) { + aspect_to = gid.z - 2; + } else { + aspect_to = gid.z - 1; + } + + if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) 
> 1e-6) { + float ar = aspect_ratios[aspect_to]; + + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); + } +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal similarity index 50% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReluKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal index e725440bbe997d571f1860bce323516144a94da8..725222d75e1b0c40ecfd2e4f95f35c13e7851e21 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReluKernel.metal @@ -17,25 +17,25 @@ using namespace metal; kernel void relu_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const half4 input = inTexture.read(gid.xy, gid.z); - const float4 relu = fmax((float4)input, 0.0); - outTexture.write(half4(relu), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const half4 input = inTexture.read(gid.xy, gid.z); + const float4 relu = fmax((float4)input, 0.0); + outTexture.write(half4(relu), gid.xy, gid.z); } kernel void relu(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); - const float4 input = inTexture.read(gid.xy, gid.z); - const float4 relu = fmax((float4)input, 0.0); - outTexture.write(float4(relu), gid.xy, gid.z); + texture2d_array outTexture [[texture(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + constexpr sampler s(coord::pixel, filter::nearest, address::clamp_to_zero); + const float4 input = inTexture.read(gid.xy, gid.z); + const float4 relu = fmax((float4)input, 0.0); + outTexture.write(float4(relu), gid.xy, gid.z); } diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal 
b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..3037e404a354ae6471db9056d84661af1434e2f7 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.inc.metal @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b +#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c +#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d + +#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p) +#define VECTOR(p, n) CONCAT2(p, n) +#define FUNC_R(f, r) CONCAT2_(f, r) + +kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant ReshapeParam &rp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4]; + ReshapeParam lrp = rp; + int oC = lrp.odim[lrp.otrans[3]]; + int iC = lrp.idim[lrp.itrans[3]]; + int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3]; + VECTOR(P, 4) r; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; +#if ROUT == 4 + xyzn2abcd_4(oC, oxyzn, oabcd); +#else + FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd); +#endif + int tabcd[4]; + invtrans(lrp.otrans, oabcd, tabcd); + int index = abcd2index(lrp.odim, tabcd); + if (index < count) { + index2abcd(lrp.idim, index, tabcd); + trans(lrp.itrans, tabcd, iabcd); +#if RIN == 4 + abcd2xyzn_4(iC, iabcd, ixyzn); +#else + FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn); +#endif + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } else { + r[n] = 0; + } + } + outTexture.write(r, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal similarity index 97% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal index d2f5815d422ec8c4f3e1e3c1992855547e002264..bb155a87a3fe5f7acfb633eded934b64ea4df178 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ReshapeKernel.metal @@ -18,10 +18,10 @@ using namespace metal; struct ReshapeParam { - int32_t idim[4]; - int32_t itrans[4]; - int32_t odim[4]; - int32_t otrans[4]; + int32_t idim[4]; + int32_t itrans[4]; + int32_t odim[4]; + int32_t otrans[4]; }; #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal new file mode 100644 index 
0000000000000000000000000000000000000000..3cca15d5518b37743ec8fb6283a4d8583e0520b6 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ResizeBilinear.metal @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +using namespace metal; + +struct resize_bilinear_param { + // int32_t out_h; + // int32_t out_w; + float ratio_h; + float ratio_w; +}; + +kernel void resize_bilinear(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(2)]], + constant resize_bilinear_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + float4 r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + float w = gid.x * pm.ratio_w; + float h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + float w1lambda = w - w0, h1lambda = h - h0; + float w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + float4 r0 = input.read(uint2(w0, h0), gid.z); + float4 r1 = input.read(uint2(w1, h0), gid.z); + float4 r2 = input.read(uint2(w0, h1), gid.z); + float4 r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); +} + +kernel void resize_bilinear_half(texture2d_array input [[texture(0)]], + texture2d_array output [[texture(2)]], + constant resize_bilinear_param & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + half4 r; + if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { + r = input.read(gid.xy, gid.z); + } else { + half w = gid.x * pm.ratio_w; + half h = gid.y * pm.ratio_h; + uint w0 = w, h0 = h; + uint w1 = w0 + 1, h1 = h0 + 1; + half w1lambda = w - w0, h1lambda = h - h0; + half w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; + if (w1 >= input.get_width()) w1 = w0; + if (h1 >= input.get_height()) h1 = h0; + half4 r0 = input.read(uint2(w0, h0), gid.z); + half4 r1 = input.read(uint2(w1, h0), gid.z); + half4 r2 = input.read(uint2(w0, h1), gid.z); + half4 r3 = input.read(uint2(w1, h1), gid.z); + r = h2lambda * (w2lambda * r0 + w1lambda * r1) + h1lambda * (w2lambda * r2 + w1lambda * r3); + } + output.write(r, gid.xy, gid.z); + output.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal new file mode 100644 index 0000000000000000000000000000000000000000..62b5fd0c929e5dae1d6dbb1e70c739b59b8b7192 --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Scale.metal @@ -0,0 +1,30 @@ +// +// Scale.metal +// paddle-mobile +// +// Created by liuRuiLong on 2019/1/4. +// Copyright © 2019 orange. All rights reserved. 
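+// Nearest-neighbour rescale kernels: each output pixel samples the input at gid scaled by the input-to-output width/height ratio (float and half output variants below).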
+// + +#include +using namespace metal; + +kernel void scale(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(input, gid); +} + +kernel void scale_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(half4(input), gid); +} diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Shape.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Shape.metal similarity index 100% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Shape.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/Shape.metal diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal new file mode 100644 index 0000000000000000000000000000000000000000..3affcadd799e3a95e2f39822b4089094003b1cff --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.inc.metal @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#ifdef P + +#define CONCAT2(a, b) a ## b +#define CONCAT2_(a, b) a ## _ ## b + +#define FUNC(f, p) CONCAT2_(f, p) +#define VECTOR(p, n) CONCAT2(p, n) + +kernel void FUNC(softmax, P)(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant SoftmaxParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + // int zsize = inTexture.get_array_size(); + P maxv = inTexture.read(uint2(0, gid.y), 0)[0]; + int group = sp.K / 4; + int remain = sp.K % 4; + for (int x = 0; x < group; x++) { + VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); + maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3])))); + } + if (remain > 0) { + VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); + for (int i = 0; i < remain; i++) { + maxv = max(maxv, r[i]); + } + } + VECTOR(P, 4) rsum = {0, 0, 0, 0}; + for (int x = 0; x < group; x++) { + VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); + rsum += exp(r - maxv); + } + P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3]; + if (remain > 0) { + VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); + for (int i = 0; i < remain; i++) { + sum += exp(r[i] - maxv); + } + } + VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z); + rr = exp(rr - maxv) / sum; + outTexture.write(rr, gid.xy, gid.z); +} + +#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal similarity index 97% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal index 67c279a4441095e710985c65d85aac589b7d0f54..f4bc8de4bc0f825d7e40d3e9deb0a8579cbae47b 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Softmax.metal @@ -16,8 +16,8 @@ using namespace metal; struct SoftmaxParam { - int N; - int K; + int N; + int K; }; #define P float diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal similarity index 57% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.inc.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal index 54e3f21e793a9c1474f13fed61857211cb7d117f..1c9bcc7e18874316db68809688622dc7ec12058b 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.inc.metal @@ -36,41 +36,41 @@ #if V == VY kernel void FUNC(split, R, N, VV, P)(texture2d_array input [[texture(0)]], - texture2d_array out1 [[texture(1)]], - texture2d_array out2 [[texture(2)]], + texture2d_array out1 [[texture(1)]], + texture2d_array out2 [[texture(2)]], #if N >= 3 - texture2d_array out3 [[texture(3)]], + texture2d_array out3 [[texture(3)]], #endif // N >= 3 #if N >= 4 - texture2d_array out4 [[texture(4)]], + texture2d_array out4 [[texture(4)]], #endif // N >= 4 - constant SplitParam &sp [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - VECTOR(P, 4) r = input.read(gid.xy, gid.z); - int y = gid.y - sp.offset; - if (y < sp.vdim[0]) { - out1.write(r, gid.xy, gid.z); - return; - } - y -= sp.vdim[0]; - if (y < sp.vdim[1]) { - out2.write(r, uint2(gid.x, y), gid.z); - return; - } + constant 
SplitParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + VECTOR(P, 4) r = input.read(gid.xy, gid.z); + int y = gid.y - sp.offset; + if (y < sp.vdim[0]) { + out1.write(r, gid.xy, gid.z); + return; + } + y -= sp.vdim[0]; + if (y < sp.vdim[1]) { + out2.write(r, uint2(gid.x, y), gid.z); + return; + } #if N >= 3 - y -= sp.vdim[1]; - if (y < sp.vdim[2]) { - out3.write(r, uint2(gid.x, y), gid.z); - return; - } + y -= sp.vdim[1]; + if (y < sp.vdim[2]) { + out3.write(r, uint2(gid.x, y), gid.z); + return; + } #endif // N >= 3 #if N >= 4 - y -= sp.vdim[2]; - if (y < sp.vdim[3]) { - out4.write(r, uint2(gid.x, y), gid.z); - return; - } + y -= sp.vdim[2]; + if (y < sp.vdim[3]) { + out4.write(r, uint2(gid.x, y), gid.z); + return; + } #endif // N >= 4 } #endif // V == VY @@ -88,30 +88,30 @@ kernel void FUNC(split, R, N, VV, P)(texture2d_array input [[te #endif // N >= 4 constant SplitParam &sp [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) r = input.read(gid.xy, gid.z); - int x = gid.x; - if (x < sp.vdim[0]) { - out1.write(r, gid.xy, gid.z); - return; - } - x -= sp.vdim[0]; - if (x < sp.vdim[1]) { - out2.write(r, uint2(x, gid.y), gid.z); - return; - } + VECTOR(P, 4) r = input.read(gid.xy, gid.z); + int x = gid.x; + if (x < sp.vdim[0]) { + out1.write(r, gid.xy, gid.z); + return; + } + x -= sp.vdim[0]; + if (x < sp.vdim[1]) { + out2.write(r, uint2(x, gid.y), gid.z); + return; + } #if N >= 3 - x -= sp.vdim[1]; - if (x < sp.vdim[2]) { - out3.write(r, uint2(x, gid.y), gid.z); - return; - } + x -= sp.vdim[1]; + if (x < sp.vdim[2]) { + out3.write(r, uint2(x, gid.y), gid.z); + return; + } #endif // N >= 3 #if N >= 4 - x -= sp.vdim[2]; - if (x < sp.vdim[3]) { - out4.write(r, uint2(x, gid.y), gid.z); - return; - } + x -= sp.vdim[2]; + if (x < sp.vdim[3]) { + out4.write(r, uint2(x, gid.y), gid.z); + return; + } #endif // N >= 4 } #endif // V == VX diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal similarity index 66% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal index 4c1e818d2bf5c7266169f406fbfaf8e322685dc4..d167608fbb54793f08f2bc620101fa3492293c0f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Split.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Split.metal @@ -18,11 +18,11 @@ using namespace metal; struct SplitParam { - int32_t idim[4]; - int32_t axis; - int32_t offset; - int32_t trans[4]; - int32_t vdim[4]; + int32_t idim[4]; + int32_t axis; + int32_t offset; + int32_t trans[4]; + int32_t vdim[4]; }; #define VNORMAL 1 @@ -36,29 +36,29 @@ struct SplitParam { //// ssd-ar: (R=3, N=2, V=y) #define V VY - #define R 3 - #define N 2 - #define P float - #include "Split.inc.metal" - #undef P - #define P half - #include "Split.inc.metal" - #undef P - #undef N - #undef R +#define R 3 +#define N 2 +#define P float +#include "Split.inc.metal" +#undef P +#define P half +#include "Split.inc.metal" +#undef P +#undef N +#undef R #undef V //// ssd-ar: (R=2, N=2, V=y) #define V VY - #define R 2 - #define N 2 - #define P float - #include "Split.inc.metal" - #undef P - #define P half - #include "Split.inc.metal" - #undef P - #undef N - #undef R +#define R 2 +#define N 2 +#define P float +#include "Split.inc.metal" +#undef P +#define P half +#include "Split.inc.metal" +#undef P +#undef N +#undef R #undef V diff --git 
a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal similarity index 53% rename from metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.inc.metal rename to metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal index 534166e45fc3db49cc5de526ec0d5179ca3f9899..d80361da46d53be81314711ad4d3c6e5420fcdc4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.inc.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.inc.metal @@ -22,39 +22,39 @@ #define VECTOR(p, n) CONCAT2(p, n) kernel void FUNC(transpose, R, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) r; - int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}; - int iabcd[4], oabcd[4], ixyzn[4]; - for (int n = 0; n < 4; n++) { - oxyzn[3] = n; + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + VECTOR(P, 4) r; + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}; + int iabcd[4], oabcd[4], ixyzn[4]; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; #if R == 4 - xyzn2abcd_4(pm.oC, oxyzn, iabcd); + xyzn2abcd_4(pm.oC, oxyzn, iabcd); #endif // R == 4 #if R == 3 - xyzn2abcd_3(oxyzn, oabcd); + xyzn2abcd_3(oxyzn, oabcd); #endif // R == 3 #if R == 2 - xyzn2abcd_2(oxyzn, oabcd); + xyzn2abcd_2(oxyzn, oabcd); #endif // R == 2 - iabcd[pm.axis[0]] = oabcd[0]; - iabcd[pm.axis[1]] = oabcd[1]; - iabcd[pm.axis[2]] = oabcd[2]; - iabcd[pm.axis[3]] = oabcd[3]; + iabcd[pm.axis[0]] = oabcd[0]; + iabcd[pm.axis[1]] = oabcd[1]; + iabcd[pm.axis[2]] = oabcd[2]; + iabcd[pm.axis[3]] = oabcd[3]; #if R == 4 - abcd2xyzn_4(pm.iC, iabcd, ixyzn); + abcd2xyzn_4(pm.iC, iabcd, ixyzn); #endif // R == 4 #if R == 3 - abcd2xyzn_3(iabcd, ixyzn); + abcd2xyzn_3(iabcd, ixyzn); #endif // R == 3 #if R == 2 - abcd2xyzn_2(iabcd, ixyzn); + abcd2xyzn_2(iabcd, ixyzn); #endif // R == 2 - r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; - } - outTexture.write(r, gid.xy, gid.z); + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } + outTexture.write(r, gid.xy, gid.z); } #endif diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..66c22f03883cb0cdcac9eff9866718735908ca0a --- /dev/null +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/TransposeKernel.metal @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include "Common.metal" +using namespace metal; + +struct TransposeParam { + int iC; + int oC; + int axis[4]; +}; + +kernel void transpose_copy_float(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); +} +kernel void transpose_copy_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); +} + +#define R 4 +#define P float +#include "TransposeKernel.inc.metal" +#undef P +#define P half +#include "TransposeKernel.inc.metal" +#undef P +#undef R + +#define R 3 +#define P float +#include "TransposeKernel.inc.metal" +#undef P +#define P half +#include "TransposeKernel.inc.metal" +#undef P +#undef R + +#define R 2 +#define P float +#include "TransposeKernel.inc.metal" +#undef P +#define P half +#include "TransposeKernel.inc.metal" +#undef P +#undef R diff --git a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift index 98f03affa2a230b2698edf6bafe5e06def8986b6..052cac90d7793e0f07c049b2d64879447e363695 100644 --- a/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift +++ b/metal/paddle-mobile-unit-test/paddle-mobile-unit-test/ViewController.swift @@ -27,8 +27,8 @@ class ViewController: UIViewController { inQueue: queue ) test.testConcat() -// test.testReshape() -// test.testTranspose() + // test.testReshape() + // test.testTranspose() print(" done ") } diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj index 17eeb75bfffcd5bb2b0d484b0fe2c9048049bebd..14cfcb3a92158fa9d98d2d2574ccc23eaa3f5477 100644 --- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj +++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -11,31 +11,16 @@ 456BB7B521F5B356001474E2 /* Framework.pbobjc.h in Headers */ = {isa = PBXBuildFile; fileRef = 456BB7B321F5B356001474E2 /* Framework.pbobjc.h */; settings = {ATTRIBUTES = (Public, ); }; }; 4AA1EA862146625E00D0F791 /* BilinearInterpOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */; }; 4AA1EA88214662BD00D0F791 /* BilinearInterpKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */; }; - 4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */; }; 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8B2146640900D0F791 /* SplitOp.swift */; }; 4AA1EA8E2146647F00D0F791 /* SplitKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */; }; - 4AA1EA90214664CD00D0F791 /* Split.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA8F214664CD00D0F791 /* Split.metal */; }; 4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA91214665D700D0F791 /* ShapeOp.swift */; }; 4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA932146661500D0F791 /* ShapeKernel.swift */; }; 
4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA972146666500D0F791 /* FlattenOp.swift */; }; - 4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */; }; 4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */; }; - 4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */; }; - 4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA5214B5F6800D0F791 /* Shape.metal */; }; - 4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */; }; - 4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */; }; - 4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */; }; - 4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */; }; - 4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928762133F1DB005B6C3A /* BoxCoder.metal */; }; - 4AF9287921341661005B6C3A /* Softmax.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9287821341661005B6C3A /* Softmax.metal */; }; - 4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF928812135673D005B6C3A /* ConcatKernel.metal */; }; - 4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */ = {isa = PBXBuildFile; fileRef = 4AF9288321357BE3005B6C3A /* Elementwise.metal */; }; C28FE02F21BA68C00054EFAC /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C28FE02C21BA68C00054EFAC /* Metal.framework */; }; C28FE03021BA68C00054EFAC /* MetalPerformanceShaders.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C28FE02D21BA68C00054EFAC /* MetalPerformanceShaders.framework */; }; C28FE03121BA68C00054EFAC /* MetalKit.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = C28FE02E21BA68C00054EFAC /* MetalKit.framework */; }; D3831F70E7E0B565B9AC22DA /* Pods_paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */; }; - FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226552138F33800F395E2 /* TransposeKernel.metal */; }; - FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC0226572138F38D00F395E2 /* PoolKernel.metal */; }; FC039B6F20E11C3C0081E9F8 /* paddle_mobile.h in Headers */ = {isa = PBXBuildFile; fileRef = FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */; settings = {ATTRIBUTES = (Public, ); }; }; FC039B9720E11C9A0081E9F8 /* Extensions.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9420E11C9A0081E9F8 /* Extensions.swift */; }; FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC039B9520E11C9A0081E9F8 /* Errors.swift */; }; @@ -61,40 +46,25 @@ FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */; }; FC0E2DBE20EE460D009C1FAC /* 
BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; - FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; FC1CF3F721D4B4C400F7392E /* Runner.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1CF3F621D4B4C400F7392E /* Runner.swift */; }; FC2BFCC221DF2F9100C262B2 /* GlobalConfig.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFCC121DF2F9100C262B2 /* GlobalConfig.swift */; }; FC2BFD4621DF685F00C262B2 /* Scale.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4521DF685F00C262B2 /* Scale.swift */; }; FC2BFD4A21DF81DE00C262B2 /* Kernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4921DF81DE00C262B2 /* Kernel.swift */; }; FC2BFD4E21DF820B00C262B2 /* ConvAddBatchNormReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD4D21DF820A00C262B2 /* ConvAddBatchNormReluOp.swift */; }; - FC2BFD5121DF8E0400C262B2 /* Scale.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC2BFD5021DF8E0400C262B2 /* Scale.metal */; }; FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; - FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; }; FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC60DB8820E9AAA500FF203F /* MetalExtension.swift */; }; FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */; }; FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */; }; - FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */; }; - FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */; }; - FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC6214CBA820094B8E5 /* Macro.metal */; }; - FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */; }; FC82735920E3C04200BE430A /* OpCreator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC82735820E3C04200BE430A /* OpCreator.swift */; }; FC9797C921D6101D00F2FD90 /* ResizeBilinearOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9797C821D6101D00F2FD90 /* ResizeBilinearOp.swift */; }; FC9797CB21D6102D00F2FD90 /* ResizeBilinearKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9797CA21D6102D00F2FD90 /* ResizeBilinearKernel.swift */; }; - FC9C2A0D21D3D185005856C6 /* FetchKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = 
FC9C2A0C21D3D185005856C6 /* FetchKernel.inc.metal */; }; FC9D037920E229E4000F735A /* OpParam.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037820E229E4000F735A /* OpParam.swift */; }; FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D037F20E22FBB000F735A /* FeedOp.swift */; }; FC9D038220E2312E000F735A /* FetchOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038120E2312E000F735A /* FetchOp.swift */; }; FC9D038420E23B01000F735A /* Texture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038320E23B01000F735A /* Texture.swift */; }; - FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */; }; - FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1642132A5EB00084FE5 /* Common.metal */; }; - FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */; }; - FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD42138272900BD58AA /* ConvAddMetal.metal */; }; - FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */; }; - FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */; }; FCB40E5921E0DCAB0075EC91 /* FetchKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */; }; - FCB91DC221FEEE990051C6B2 /* BufferToTexture.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCB91DC121FEEE990051C6B2 /* BufferToTexture.metal */; }; FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; }; FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; }; FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; }; @@ -109,7 +79,6 @@ FCBCCC6D2123073A00D94F7E /* BoxcoderKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */; }; FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */; }; FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */; }; - FCCED5E121D71FC000BE8D5F /* PoolKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCCED5E021D71FC000BE8D5F /* PoolKernel.inc.metal */; }; FCD04E6620F314C50007374F /* PoolOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6520F314C50007374F /* PoolOp.swift */; }; FCD04E6820F315020007374F /* PoolKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6720F315020007374F /* PoolKernel.swift */; }; FCD04E6A20F319EC0007374F /* SoftmaxOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCD04E6920F319EC0007374F /* SoftmaxOp.swift */; }; @@ -121,19 +90,12 @@ FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDC0FEA21099A1D00DC9EFB /* Tools.swift */; }; FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift 
in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */; }; FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */; }; - FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */; }; - FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; }; - FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; }; FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; }; FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */; }; FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */; }; FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */; }; FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */; }; - FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */; }; - FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */; }; FCE9D7B7214F869000B520C3 /* Net.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B6214F869000B520C3 /* Net.swift */; }; - FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; }; - FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; }; FCEB684C212F093800D2448E /* PreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEB684B212F093800D2448E /* PreluOp.swift */; }; FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */; }; /* End PBXBuildFile section */ @@ -143,34 +105,18 @@ 456BB7B321F5B356001474E2 /* Framework.pbobjc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Framework.pbobjc.h; sourceTree = ""; }; 4AA1EA852146625E00D0F791 /* BilinearInterpOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpOp.swift; sourceTree = ""; }; 4AA1EA87214662BD00D0F791 /* BilinearInterpKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = BilinearInterpKernel.swift; sourceTree = ""; }; - 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.metal; sourceTree = ""; }; 4AA1EA8B2146640900D0F791 /* SplitOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.swift; path = SplitOp.swift; sourceTree = ""; }; 4AA1EA8D2146647F00D0F791 /* SplitKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = SplitKernel.swift; sourceTree = ""; }; - 4AA1EA8F214664CD00D0F791 /* Split.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.metal; sourceTree = ""; }; 4AA1EA91214665D700D0F791 /* ShapeOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeOp.swift; sourceTree = ""; }; 4AA1EA932146661500D0F791 /* ShapeKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ShapeKernel.swift; sourceTree = ""; }; 4AA1EA972146666500D0F791 /* FlattenOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenOp.swift; sourceTree = ""; }; - 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ConcatKernel.inc.metal; sourceTree = ""; }; - 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */ = {isa = PBXFileReference; explicitFileType = sourcecode.metal; fileEncoding = 4; path = ReshapeKernel.inc.metal; sourceTree = ""; }; 4AA1EAA1214912CC00D0F791 /* FlattenKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FlattenKernel.swift; sourceTree = ""; }; - 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Split.inc.metal; sourceTree = ""; }; - 4AA1EAA5214B5F6800D0F791 /* Shape.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Shape.metal; sourceTree = ""; }; - 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BilinearInterp.inc.metal; sourceTree = ""; }; - 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.inc.metal; sourceTree = ""; }; - 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.inc.metal; sourceTree = ""; }; - 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = TransposeKernel.inc.metal; sourceTree = ""; }; - 4AF928762133F1DB005B6C3A /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = ""; }; - 4AF9287821341661005B6C3A /* Softmax.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Softmax.metal; sourceTree = ""; }; - 4AF928812135673D005B6C3A /* ConcatKernel.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = ConcatKernel.metal; sourceTree = ""; }; - 4AF9288321357BE3005B6C3A /* Elementwise.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = Elementwise.metal; sourceTree = ""; }; C28FE02C21BA68C00054EFAC /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; C28FE02D21BA68C00054EFAC /* 
MetalPerformanceShaders.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalPerformanceShaders.framework; path = System/Library/Frameworks/MetalPerformanceShaders.framework; sourceTree = SDKROOT; }; C28FE02E21BA68C00054EFAC /* MetalKit.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = MetalKit.framework; path = System/Library/Frameworks/MetalKit.framework; sourceTree = SDKROOT; }; CDF58151D902A1CBAE56A0C2 /* Pods-paddle-mobile.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.debug.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.debug.xcconfig"; sourceTree = ""; }; DD2E06330A1E7129C918DB46 /* Pods_paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = Pods_paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; E2A7957C92EDA5C3BEC0FFC2 /* Pods-paddle-mobile.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-paddle-mobile.release.xcconfig"; path = "../Pods/Target Support Files/Pods-paddle-mobile/Pods-paddle-mobile.release.xcconfig"; sourceTree = ""; }; - FC0226552138F33800F395E2 /* TransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = TransposeKernel.metal; sourceTree = ""; }; - FC0226572138F38D00F395E2 /* PoolKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.metal; sourceTree = ""; }; FC039B6A20E11C3C0081E9F8 /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; }; FC039B6D20E11C3C0081E9F8 /* paddle_mobile.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = paddle_mobile.h; sourceTree = ""; }; FC039B6E20E11C3C0081E9F8 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = ""; }; @@ -198,41 +144,26 @@ FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvKernel.swift; sourceTree = ""; }; FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = ""; }; FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = ""; }; - FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = ""; }; FC1CF3F621D4B4C400F7392E /* Runner.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Runner.swift; sourceTree = ""; }; FC2BFCC121DF2F9100C262B2 /* GlobalConfig.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = GlobalConfig.swift; sourceTree = ""; }; FC2BFD4521DF685F00C262B2 /* Scale.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Scale.swift; sourceTree = ""; }; FC2BFD4921DF81DE00C262B2 /* Kernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Kernel.swift; sourceTree = ""; }; FC2BFD4D21DF820A00C262B2 /* ConvAddBatchNormReluOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; 
lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluOp.swift; sourceTree = ""; }; - FC2BFD5021DF8E0400C262B2 /* Scale.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Scale.metal; sourceTree = ""; }; FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = ""; }; - FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = ""; }; FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProgramOptimize.swift; sourceTree = ""; }; FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; }; FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = ""; }; FC60DB8820E9AAA500FF203F /* MetalExtension.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MetalExtension.swift; sourceTree = ""; }; FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluOp.swift; sourceTree = ""; }; FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddPreluKernel.swift; sourceTree = ""; }; - FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPreluKernel.metal; sourceTree = ""; }; - FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddPrelu.inc.metal; sourceTree = ""; }; - FC803BC6214CBA820094B8E5 /* Macro.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Macro.metal; sourceTree = ""; }; - FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.metal; sourceTree = ""; }; FC82735820E3C04200BE430A /* OpCreator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpCreator.swift; sourceTree = ""; }; FC9797C821D6101D00F2FD90 /* ResizeBilinearOp.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ResizeBilinearOp.swift; sourceTree = ""; }; FC9797CA21D6102D00F2FD90 /* ResizeBilinearKernel.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = ResizeBilinearKernel.swift; sourceTree = ""; }; - FC9C2A0C21D3D185005856C6 /* FetchKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = FetchKernel.inc.metal; sourceTree = ""; }; FC9D037820E229E4000F735A /* OpParam.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OpParam.swift; sourceTree = ""; }; FC9D037F20E22FBB000F735A /* FeedOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FeedOp.swift; sourceTree = ""; }; FC9D038120E2312E000F735A /* FetchOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchOp.swift; sourceTree = ""; }; FC9D038320E23B01000F735A /* Texture.swift */ = {isa = PBXFileReference; 
lastKnownFileType = sourcecode.swift; path = Texture.swift; sourceTree = ""; }; - FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = ""; }; - FCA3A1642132A5EB00084FE5 /* Common.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = ""; }; - FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvTransposeKernel.metal; sourceTree = ""; }; - FCA67CD42138272900BD58AA /* ConvAddMetal.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddMetal.metal; sourceTree = ""; }; - FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvAddBNReluKernel.metal; sourceTree = ""; }; - FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvBNReluKernel.metal; sourceTree = ""; }; FCB40E5821E0DCAB0075EC91 /* FetchKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FetchKernel.swift; sourceTree = ""; }; - FCB91DC121FEEE990051C6B2 /* BufferToTexture.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BufferToTexture.metal; sourceTree = ""; }; FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = ""; }; FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = ""; }; FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = ""; }; @@ -247,7 +178,6 @@ FCBCCC6C2123073A00D94F7E /* BoxcoderKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BoxcoderKernel.swift; sourceTree = ""; }; FCBCCC6E2123097100D94F7E /* MulticlassNMSOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSOp.swift; sourceTree = ""; }; FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MulticlassNMSKernel.swift; sourceTree = ""; }; - FCCED5E021D71FC000BE8D5F /* PoolKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PoolKernel.inc.metal; sourceTree = ""; }; FCD04E6520F314C50007374F /* PoolOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolOp.swift; sourceTree = ""; }; FCD04E6720F315020007374F /* PoolKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PoolKernel.swift; sourceTree = ""; }; FCD04E6920F319EC0007374F /* SoftmaxOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SoftmaxOp.swift; sourceTree = ""; }; @@ -259,19 +189,12 @@ FCDC0FEA21099A1D00DC9EFB /* Tools.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tools.swift; sourceTree = ""; }; FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluKernel.swift; sourceTree = ""; }; FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path 
= ConvTransposeKernel.swift; sourceTree = ""; }; - FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = BatchNormKernel.metal; sourceTree = ""; }; - FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = ""; }; - FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = ""; }; FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = ""; }; FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluOp.swift; sourceTree = ""; }; FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluKernel.swift; sourceTree = ""; }; FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluOp.swift; sourceTree = ""; }; FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluKernel.swift; sourceTree = ""; }; - FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = ""; }; - FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.metal; sourceTree = ""; }; FCE9D7B6214F869000B520C3 /* Net.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Net.swift; sourceTree = ""; }; - FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = ""; }; - FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = ""; }; FCEB684B212F093800D2448E /* PreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PreluOp.swift; sourceTree = ""; }; FCEBC0F520F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddBatchNormReluKernel.swift; sourceTree = ""; }; /* End PBXFileReference section */ @@ -426,7 +349,6 @@ isa = PBXGroup; children = ( FCDDC6CD212FE02100E5EF74 /* Base */, - FCEB6837212F00B100D2448E /* metal */, FC9797CA21D6102D00F2FD90 /* ResizeBilinearKernel.swift */, FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, @@ -498,52 +420,6 @@ path = Base; sourceTree = ""; }; - FCEB6837212F00B100D2448E /* metal */ = { - isa = PBXGroup; - children = ( - FCB91DC121FEEE990051C6B2 /* BufferToTexture.metal */, - 4AF928812135673D005B6C3A /* ConcatKernel.metal */, - 4AA1EA9D2148D6F900D0F791 /* ConcatKernel.inc.metal */, - 4AF9288321357BE3005B6C3A /* Elementwise.metal */, - FC1B16B220EC9A4F00678B91 /* Kernels.metal */, - FC4CB74820F0B954007C0C6D /* ConvKernel.metal */, - 4AF928762133F1DB005B6C3A /* BoxCoder.metal */, - 4AA1EAA9214F53D800D0F791 /* BoxCoder.inc.metal */, - 4AA1EAA5214B5F6800D0F791 
/* Shape.metal */, - 4AA1EA8F214664CD00D0F791 /* Split.metal */, - 4AA1EAA3214A295C00D0F791 /* Split.inc.metal */, - 4AA1EA892146631C00D0F791 /* BilinearInterp.metal */, - 4AA1EAA7214B7AFB00D0F791 /* BilinearInterp.inc.metal */, - 4AF9287821341661005B6C3A /* Softmax.metal */, - 4AA1EAAB214F55C800D0F791 /* Softmax.inc.metal */, - FCEB6849212F00DB00D2448E /* PreluKernel.metal */, - FCDDC6C9212FDF6800E5EF74 /* BatchNormKernel.metal */, - FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */, - FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */, - FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */, - 4AA1EA9F2148DEEE00D0F791 /* ReshapeKernel.inc.metal */, - FCA3A1642132A5EB00084FE5 /* Common.metal */, - FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */, - FCA67CD42138272900BD58AA /* ConvAddMetal.metal */, - FCA67CD6213827AC00BD58AA /* ConvAddBNReluKernel.metal */, - FCA67CD82138287B00BD58AA /* ConvBNReluKernel.metal */, - FC0226552138F33800F395E2 /* TransposeKernel.metal */, - 4AA1EAAD214F5FD900D0F791 /* TransposeKernel.inc.metal */, - FC0226572138F38D00F395E2 /* PoolKernel.metal */, - FCCED5E021D71FC000BE8D5F /* PoolKernel.inc.metal */, - FC803BC2214CB79C0094B8E5 /* ConvAddPreluKernel.metal */, - FC803BC4214CB8F00094B8E5 /* ConvAddPrelu.inc.metal */, - FC803BC6214CBA820094B8E5 /* Macro.metal */, - FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */, - FC9C2A0C21D3D185005856C6 /* FetchKernel.inc.metal */, - FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */, - FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */, - FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */, - FC2BFD5021DF8E0400C262B2 /* Scale.metal */, - ); - path = metal; - sourceTree = ""; - }; /* End PBXGroup section */ /* Begin PBXHeadersBuildPhase section */ @@ -647,67 +523,41 @@ buildActionMask = 2147483647; files = ( FC9D038020E22FBB000F735A /* FeedOp.swift in Sources */, - FC9C2A0D21D3D185005856C6 /* FetchKernel.inc.metal in Sources */, - 4AA1EAAA214F53D800D0F791 /* BoxCoder.inc.metal in Sources */, FC039B9F20E11CB20081E9F8 /* Tensor.swift in Sources */, - FC803BC9214CFC8D0094B8E5 /* FetchKernel.metal in Sources */, - FCA67CD7213827AC00BD58AA /* ConvAddBNReluKernel.metal in Sources */, - 4AF9287921341661005B6C3A /* Softmax.metal in Sources */, 4AA1EA942146661500D0F791 /* ShapeKernel.swift in Sources */, FC0E2DBC20EE45FE009C1FAC /* ConvKernel.swift in Sources */, FC039BAA20E11CBC0081E9F8 /* ElementwiseAddOp.swift in Sources */, FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */, FCBCCC6B2123071700D94F7E /* BoxcoderOp.swift in Sources */, - 4AA1EAAE214F5FD900D0F791 /* TransposeKernel.inc.metal in Sources */, - 4AA1EAA4214A295C00D0F791 /* Split.inc.metal in Sources */, - FC803BC7214CBA820094B8E5 /* Macro.metal in Sources */, FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */, - 4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */, FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */, - FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */, FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */, FC039BBB20E11CC20081E9F8 /* PMProgramDesc.swift in Sources */, FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */, FC9D037920E229E4000F735A /* OpParam.swift in Sources */, FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, - FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, - FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, FCDDC6C6212F9FB800E5EF74 /* PreluKernel.swift in Sources */, 
FC9797CB21D6102D00F2FD90 /* ResizeBilinearKernel.swift in Sources */, - FCA67CD52138272900BD58AA /* ConvAddMetal.metal in Sources */, FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */, 4AA1EA8C2146640900D0F791 /* SplitOp.swift in Sources */, - FCB91DC221FEEE990051C6B2 /* BufferToTexture.metal in Sources */, - 4AA1EAAC214F55C800D0F791 /* Softmax.inc.metal in Sources */, FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */, - 4AF928772133F1DB005B6C3A /* BoxCoder.metal in Sources */, FC803BBF214CB65A0094B8E5 /* ConvAddPreluOp.swift in Sources */, FCEB684C212F093800D2448E /* PreluOp.swift in Sources */, - 4AA1EAA8214B7AFB00D0F791 /* BilinearInterp.inc.metal in Sources */, - FCA67CD92138287B00BD58AA /* ConvBNReluKernel.metal in Sources */, FC60DB8920E9AAA500FF203F /* MetalExtension.swift in Sources */, FCEBC0F620F1FE120099DBAF /* ConvAddBatchNormReluKernel.swift in Sources */, - 4AA1EA8A2146631C00D0F791 /* BilinearInterp.metal in Sources */, - FCDDC6CA212FDF6800E5EF74 /* BatchNormKernel.metal in Sources */, - FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */, FC039BBA20E11CC20081E9F8 /* TensorDesc.swift in Sources */, FC039BA020E11CB20081E9F8 /* Dim.swift in Sources */, FC039B9920E11C9A0081E9F8 /* Types.swift in Sources */, - FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */, - FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */, FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */, FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */, - FCCED5E121D71FC000BE8D5F /* PoolKernel.inc.metal in Sources */, FC2BFD4A21DF81DE00C262B2 /* Kernel.swift in Sources */, FC9D038420E23B01000F735A /* Texture.swift in Sources */, - FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */, FC2BFD4E21DF820B00C262B2 /* ConvAddBatchNormReluOp.swift in Sources */, 4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */, 4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */, FC2BFCC221DF2F9100C262B2 /* GlobalConfig.swift in Sources */, 456BB7B421F5B356001474E2 /* Framework.pbobjc.m in Sources */, FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */, - 4AA1EAA6214B5F6800D0F791 /* Shape.metal in Sources */, FCD04E6E20F31B4B0007374F /* ReshapeOp.swift in Sources */, FC039B9820E11C9A0081E9F8 /* Errors.swift in Sources */, FC039BBF20E11CC20081E9F8 /* Attribute.swift in Sources */, @@ -716,12 +566,9 @@ FC1CF3F721D4B4C400F7392E /* Runner.swift in Sources */, FC039BB920E11CC20081E9F8 /* Scope.swift in Sources */, FCD04E6620F314C50007374F /* PoolOp.swift in Sources */, - FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */, FC039BAC20E11CBC0081E9F8 /* BatchNormOp.swift in Sources */, FCBCCC6F2123097100D94F7E /* MulticlassNMSOp.swift in Sources */, FC039BBC20E11CC20081E9F8 /* PMVarDesc.swift in Sources */, - FC803BC5214CB8F00094B8E5 /* ConvAddPrelu.inc.metal in Sources */, - 4AF928822135673D005B6C3A /* ConcatKernel.metal in Sources */, FCBCCC632122FCC000D94F7E /* TransposeKernel.swift in Sources */, FCBCCC71212309A700D94F7E /* MulticlassNMSKernel.swift in Sources */, FCDC0FEB21099A1D00DC9EFB /* Tools.swift in Sources */, @@ -732,7 +579,6 @@ FCBCCC69212306D300D94F7E /* ConcatKernel.swift in Sources */, FCDDC6C8212FA3CA00E5EF74 /* ConvTransposeKernel.swift in Sources */, FC82735920E3C04200BE430A /* OpCreator.swift in Sources */, - FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */, 4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */, FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */, 
FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */, @@ -744,24 +590,16 @@ FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */, FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */, FC9D038220E2312E000F735A /* FetchOp.swift in Sources */, - FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */, FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */, - FC2BFD5121DF8E0400C262B2 /* Scale.metal in Sources */, FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */, FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */, FCD04E6C20F31A280007374F /* SoftmaxKernel.swift in Sources */, - FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */, - 4AA1EAA02148DEEE00D0F791 /* ReshapeKernel.inc.metal in Sources */, - FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */, FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */, FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */, FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */, FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */, FC039BC020E11CC20081E9F8 /* PMBlockDesc.swift in Sources */, - FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */, - 4AA1EA90214664CD00D0F791 /* Split.metal in Sources */, FCD04E6820F315020007374F /* PoolKernel.swift in Sources */, - FC0226582138F38D00F395E2 /* PoolKernel.metal in Sources */, FC039BAD20E11CBC0081E9F8 /* ReluOp.swift in Sources */, FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */, FC039BBE20E11CC20081E9F8 /* PMOpDesc.swift in Sources */, @@ -903,7 +741,7 @@ CODE_SIGN_IDENTITY = "iPhone Developer"; CODE_SIGN_STYLE = Automatic; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = ""; + DEVELOPMENT_TEAM = A798K58VVL; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -920,7 +758,7 @@ "$(inherited)", "$(PROJECT_DIR)/paddle-mobile/CPU", ); - MACH_O_TYPE = mh_dylib; + MACH_O_TYPE = staticlib; MTL_LANGUAGE_REVISION = UseDeploymentTarget; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; @@ -940,7 +778,7 @@ CODE_SIGN_IDENTITY = "iPhone Developer"; CODE_SIGN_STYLE = Automatic; DEFINES_MODULE = YES; - DEVELOPMENT_TEAM = ""; + DEVELOPMENT_TEAM = A798K58VVL; DYLIB_COMPATIBILITY_VERSION = 1; DYLIB_CURRENT_VERSION = 1; DYLIB_INSTALL_NAME_BASE = "@rpath"; @@ -957,7 +795,7 @@ "$(inherited)", "$(PROJECT_DIR)/paddle-mobile/CPU", ); - MACH_O_TYPE = mh_dylib; + MACH_O_TYPE = staticlib; MTL_LANGUAGE_REVISION = UseDeploymentTarget; PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile"; PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)"; diff --git a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift index da66460d8bc895ce3b31e1ec7866765827515054..57b4f8201778d92d79f12c2fc6b43949660c8bcb 100644 --- a/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift +++ b/metal/paddle-mobile/paddle-mobile/API/GlobalConfig.swift @@ -15,24 +15,26 @@ import Foundation @objc public enum MetalLoadMode: Int { - case - LoadMetalInPaddleMobile = 1, // 使用 paddle-mobile 中的 metal 代码 - LoadMetalInDefaultLib = 2, // 使用 main bundle 中的 metal 代码 - LoadMetalInCustomMetalLib = 3 // 使用 metal 库文件 + case + LoadMetalInPaddleMobile = 1, // 使用 paddle-mobile 中的 metal 代码 + LoadMetalInDefaultLib = 2, // 使用 main bundle 中的 metal 代码 + LoadMetalInCustomMetalLib = 3 // 使用 metal 库文件 } -@objc public enum ComputePrecision: Int { - case - Float32 = 1, - Float16 = 2 +@objc public 
enum Precision: Int { + case + Float32 = 1, + Float16 = 2 } @objc public class GlobalConfig: NSObject { - - /// 单例 - @objc public static let shared: GlobalConfig = GlobalConfig.init() - - /// 运算精度, runner 生命周期中不可变 - @objc public var computePrecision: ComputePrecision = .Float16 - + + /// 单例 + @objc public static let shared: GlobalConfig = GlobalConfig.init() + + /// 运算精度, runner 生命周期中不可变 + @objc public var computePrecision: Precision = .Float16 + + /// 是否开启 log + @objc public var debug: Bool = false } diff --git a/metal/paddle-mobile/paddle-mobile/API/Net.swift b/metal/paddle-mobile/paddle-mobile/API/Net.swift index 6eb7732410a54829c2633bc6cb3449ef25b5ec9d..35cd09eb49cde78a958ab019e69b03d4dfe35d0d 100644 --- a/metal/paddle-mobile/paddle-mobile/API/Net.swift +++ b/metal/paddle-mobile/paddle-mobile/API/Net.swift @@ -17,71 +17,79 @@ import Foundation /// 网络的基类, 参数已经给了默认值,请在子类实现中修改需要改的参数 @objc open class Net: NSObject { + + /// 默认为0, 如果指定个数, 后边 except 个op不使用 GPU 运算, 中间结果会通过 fetchResult 传参过来 + @objc public var except: Int = 0 + + /// 预处理 kernel, 如果输入图像需要预处理, 则指定预处理 kernel + @objc public var preprocessKernel: CusomKernel? = nil + + // 以下四个参数为从内存中读取模型时用到的参数 + /// 模型在内存中的指针 + @objc public var modelPointer: UnsafeMutableRawPointer? = nil + + /// 模型大小 单位: 字节 + @objc public var modelSize: Int = 0 + + /// 权重参数在内存中的指针 + @objc public var paramPointer: UnsafeMutableRawPointer? = nil + + /// 权重大小 单位: 字节 + @objc public var paramSize: Int = 0 + + // 以下两个为从文件中读取模型时用到的参数 + /// 模型文件路径 + @objc public var modelPath: String? = nil + + /// 权重文件路径 + @objc public var paramPath: String? = nil + + /// 代表着 GPU 处理器 + @objc public let device: MTLDevice + + /// metal 代码加载方式 注意: 如果静态库只能使用 LoadMetalInDefaultLib LoadMetalInCustomMetalLib 进行 load metal 代码 + @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile + + /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 + @objc public var metalLibPath: String? = nil + + /// 输入维度,按照 n h w c 方式传入 + @objc public var inputDim: Dim = Dim.init(inDim: []) + + /// 是否使用 MetalPerformanceShaders 进行运算 + @objc public var useMPS: Bool = false + + /// 模型精度 - 当使用模型精度为 Float 16 时 不要开启 useMPS, 暂不支持 + @objc public var paramPrecision: Precision = .Float32 - /// 默认为0, 如果指定个数, 后边 except 个op不使用 GPU 运算, 中间结果会通过 fetchResult 传参过来 - @objc public var except: Int = 0 - - /// 预处理 kernel, 如果输入图像需要预处理, 则指定预处理 kernel - @objc public var preprocessKernel: CusomKernel? = nil - - // 以下四个参数为从内存中读取模型时用到的参数 - /// 模型在内存中的指针 - @objc public var modelPointer: UnsafeMutableRawPointer? = nil - - /// 模型大小 单位: 字节 - @objc public var modelSize: Int = 0 - - /// 权重参数在内存中的指针 - @objc public var paramPointer: UnsafeMutableRawPointer? = nil - - /// 权重大小 单位: 字节 - @objc public var paramSize: Int = 0 - - // 以下两个为从文件中读取模型时用到的参数 - /// 模型文件路径 - @objc public var modelPath: String? = nil - - /// 权重文件路径 - @objc public var paramPath: String? = nil - - /// 代表着 GPU 处理器 - @objc public let device: MTLDevice - - /// metal 代码加载方式 注意: 如果静态库只能使用 LoadMetalInDefaultLib LoadMetalInCustomMetalLib 进行 load metal 代码 - @objc public var metalLoadMode: MetalLoadMode = .LoadMetalInPaddleMobile - - /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 - @objc public var metalLibPath: String? 
= nil - - /// 输入维度,按照 n h w c 方式传入 - @objc public var inputDim: Dim = Dim.init(inDim: []) - - - @objc public init(device: MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) { - self.paramPointer = paramPointer - self.paramSize = paramSize - self.modelPointer = modePointer - self.modelSize = modelSize - self.device = device - super.init() - } - - @objc public init(device: MTLDevice) { - self.device = device - super.init() - } - - @objc open func resultStr(res: ResultHolder) -> String { - fatalError() - } - - @objc open func fetchResult(paddleMobileRes: GPUResultHolder) -> ResultHolder { - guard let inResPointer = paddleMobileRes.resultPointer else { - fatalError() + @objc public init(device: MTLDevice, inParamPointer: UnsafeMutableRawPointer, inParamSize:Int, inModelPointer: UnsafeMutableRawPointer, inModelSize: Int) { + self.paramPointer = inParamPointer + self.paramSize = inParamSize + self.modelPointer = inModelPointer + self.modelSize = inModelSize + self.device = device + super.init() } - return ResultHolder.init(inResult: inResPointer, inCapacity: paddleMobileRes.capacity) - } - - open func updateProgram(program: Program) { - } + + @objc public init(device: MTLDevice) { + self.device = device + super.init() + } + + @objc open func resultStr(res: [ResultHolder]) -> String { + fatalError() + } + + @objc open func fetchResult(paddleMobileRes: [GPUResultHolder]) -> [ResultHolder] { + return paddleMobileRes.map { (gpuRes) -> ResultHolder in + guard let inResPointer = gpuRes.resultPointer else { + fatalError() + } + return ResultHolder.init(inResult: inResPointer, inCapacity: gpuRes.capacity, inDim: gpuRes.dim) + } + } + + open func updateProgram(program: Program) { + } + } diff --git a/metal/paddle-mobile/paddle-mobile/API/Runner.swift b/metal/paddle-mobile/paddle-mobile/API/Runner.swift index b4b7403ec8d0f6086eca373ffca34cee33545c94..a0c7ec68647b2e15b43b5e6ccd18d620cfed89dc 100644 --- a/metal/paddle-mobile/paddle-mobile/API/Runner.swift +++ b/metal/paddle-mobile/paddle-mobile/API/Runner.swift @@ -16,185 +16,203 @@ import MetalKit import Foundation @objc public class ResultHolder: NSObject { - @objc public let result: UnsafeMutablePointer - @objc public let capacity: Int - - init(inResult: UnsafeMutablePointer, inCapacity: Int) { - result = inResult - capacity = inCapacity - } - - @objc public func releasePointer() { - result.deinitialize(count: capacity) - result.deallocate() - } + @objc public let result: UnsafeMutablePointer + @objc public let capacity: Int + @objc public let dim: [Int] + + init(inResult: UnsafeMutablePointer, inCapacity: Int, inDim: [Int]) { + result = inResult + capacity = inCapacity + dim = inDim + } + + @objc public func releasePointer() { + result.deinitialize(count: capacity) + result.deallocate() + } } @objc public class Runner: NSObject { - var program: Program? - var executor: Executor? - var queue: MTLCommandQueue? - var textureLoader: MTKTextureLoader? - public let net: Net - let device: MTLDevice? - let numel: Int - - /// 初始化函数 - /// - /// - Parameters: - /// - inNet: 传入自定义的网络 - /// - commandQueue: commandQueue - @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) { - guard inNet.inputDim.cout() == 4 else { - fatalError(" input dim count must 4 ") - } + var program: Program? + var executor: Executorable? + var queue: MTLCommandQueue? + var textureLoader: MTKTextureLoader? + public let net: Net + let device: MTLDevice? 
+ let numel: Int - net = inNet - queue = commandQueue - device = queue?.device - if let inDevice = device { - textureLoader = MTKTextureLoader.init(device: inDevice) - } - numel = net.inputDim.numel() - } - - /// load 模型, 返回 true 可进行预测 - /// - /// - Returns: load 成功或失败 - @objc public func load() -> Bool { - guard let inDevice = device, let inQueue = queue else { - print(" paddle mobile gpu load error, need MTLCommandQueue") - return false - } - let loader = Loader.init() - do { - - if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer { - guard net.paramSize > 0 && net.modelSize > 0 else { - print(" load from memory param size or model size can't 0 ") - return false - } - program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize) - } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath { - program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath) - } else { - print(" model pointer or model file path need be specified") - return false + /// 初始化函数 + /// + /// - Parameters: + /// - inNet: 传入自定义的网络 + /// - commandQueue: commandQueue + @objc public init(inNet: Net, commandQueue: MTLCommandQueue?) { + guard inNet.inputDim.cout() == 4 else { + fatalError(" input dim count must 4 ") } - let initContext: InitContext = InitContext.init() - initContext.metalLoadMode = net.metalLoadMode - initContext.metalLibPath = net.metalLibPath - executor = try Executor.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext) - net.updateProgram(program: program!) - } catch let error { - print(error) - return false - } - return true - } - - /// 预测 - /// - /// - Parameters: - /// - texture: 输入 texture 需要使用 getTexture 获得 - /// - completion: 结果回调, 当 success 为 true 时 result 不为 nil - @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: ResultHolder?) -> Void) { - do { - try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in - guard let SSelf = self else { - fatalError( " self nil " ) + net = inNet + queue = commandQueue + device = queue?.device + if let inDevice = device { + textureLoader = MTKTextureLoader.init(device: inDevice) } - let result = SSelf.net.fetchResult(paddleMobileRes: res) - completion(true, result) - }, preProcessKernle: self.net.preprocessKernel, except: self.net.except) - } catch let error { - print(error) - completion(false, nil) - return - } - } - - /// 清理内存, 调用此函数后, 不能再使用, 需重新 load - @objc public func clear() { - executor?.clear() - executor = nil - program = nil - } - - /// 获取 texture, 对 texture 进行预处理, 预测时使用 - /// - /// - Parameters: - /// - image: 输入图像 - /// - getTexture: 获取 texture 回调 - @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) { - let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! 
" texture loader error" - scaleTexture(input: texture!, complete: getTexture) - } - - /// 通过 buffer 获取 texture, 内部会使用GPU进行转换操作 - /// - /// - Parameters: - /// - inBuffer: 输入buffer - /// - getTexture: 结果回调 - @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) { - guard let inQueue = queue, let inDevice = device else { - fatalError( " queue or devcie nil " ) - } - - guard let buffer = inQueue.makeCommandBuffer() else { - fatalError( " make buffer error" ) + numel = net.inputDim.numel() } - let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) - do { - try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer) - } catch { - fatalError(" bufferToTextureKernel error ") + /// load 模型, 返回 true 可进行预测 + /// + /// - Returns: load 成功或失败 + @objc public func load() -> Bool { + guard let inDevice = device, let inQueue = queue else { + print(" paddle mobile gpu load error, need MTLCommandQueue") + return false + } + var loader: Loaderable + switch net.paramPrecision { + case .Float16: + loader = Loader.init() + case .Float32: + loader = Loader.init() + } + + do { + + if let inParamPointer = net.paramPointer, let inModelPointer = net.modelPointer { + guard net.paramSize > 0 && net.modelSize > 0 else { + print(" load from memory param size or model size can't 0 ") + return false + } + program = try loader.load(device: inDevice, paramPointer: inParamPointer, paramSize: net.paramSize,modePointer:inModelPointer,modelSize:net.modelSize) + } else if let inModelPath = net.modelPath, let inParamPath = net.paramPath { + program = try loader.load(device: inDevice, modelPath: inModelPath, paraPath: inParamPath) + } else { + print(" model pointer or model file path need be specified") + return false + } + + let initContext: InitContext = InitContext.init() + initContext.metalLoadMode = net.metalLoadMode + initContext.metalLibPath = net.metalLibPath + initContext.useMPS = net.useMPS + + switch net.paramPrecision { + case .Float16: + executor = try Executor.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext) + case .Float32: + executor = try Executor.init(inDevice: inDevice, inQueue: inQueue, inProgram: program!, initContext: initContext) + } + + net.updateProgram(program: program!) + } catch let error { + print(error) + return false + } + return true } - buffer.addCompletedHandler { (buffer) in - getTexture(bufferToTextureKernel.outputTexture) + /// 预测 + /// + /// - Parameters: + /// - texture: 输入 texture 需要使用 getTexture 获得 + /// - completion: 结果回调, 当 success 为 true 时 result 不为 nil + @objc public func predict(texture: MTLTexture, completion: @escaping ( _ success: Bool, _ result: [ResultHolder]?) 
-> Void) { + do { + + try self.executor?.predict(input: texture, dim: self.net.inputDim, completionHandle: { [weak self] (res) in + guard let SSelf = self else { + fatalError( " self nil " ) + } + let result = SSelf.net.fetchResult(paddleMobileRes: res) + completion(true, result) + }, preProcessKernle: self.net.preprocessKernel, except: self.net.except) + } catch let error { + print(error) + completion(false, nil) + return + } } - buffer.commit() - } - - /// 更新输入维度, 针对可变长输入模型 - /// - /// - Parameter inDim: 输入维度 - @objc public func updateInputDim(inDim: Dim) { - if net.inputDim != inDim { - guard let inProgram = program else { - fatalError(" need load first ") - } - net.inputDim = inDim - net.updateProgram(program: inProgram) + /// 清理内存, 调用此函数后, 不能再使用, 需重新 load + @objc public func clear() { + executor?.clear() + executor = nil + program = nil } - } - - public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) { - guard let inQueue = queue, let inDevice = device else { - fatalError( " queue or devcie nil " ) + /// 获取 texture, 对 texture 进行预处理, 预测时使用 + /// + /// - Parameters: + /// - image: 输入图像 + /// - getTexture: 获取 texture 回调 + @objc public func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) { + let texture = try? textureLoader?.newTexture(cgImage: image, options: [:]) ?! " texture loader error" + scaleTexture(input: texture!, complete: getTexture) } - guard let buffer = inQueue.makeCommandBuffer() else { - fatalError( " make buffer error" ) + /// 通过 buffer 获取 texture, 内部会使用GPU进行转换操作 + /// + /// - Parameters: + /// - inBuffer: 输入buffer + /// - getTexture: 结果回调 + @objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) { + guard let inQueue = queue, let inDevice = device else { + fatalError( " queue or devcie nil " ) + } + + guard let buffer = inQueue.makeCommandBuffer() else { + fatalError( " make buffer error" ) + } + + let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) + do { + try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer) + } catch { + fatalError(" bufferToTextureKernel error ") + } + + buffer.addCompletedHandler { (buffer) in + getTexture(bufferToTextureKernel.outputTexture) + } + + buffer.commit() } - let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) - - do { - try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer) - } catch let error { - print(error) - fatalError() + /// 更新输入维度, 针对可变长输入模型 + /// + /// - Parameter inDim: 输入维度 + @objc public func updateInputDim(inDim: Dim) { + if net.inputDim != inDim { + guard let inProgram = program else { + fatalError(" need load first ") + } + net.inputDim = inDim + net.updateProgram(program: inProgram) + } } - buffer.addCompletedHandler { (buffer) in - complete(scaleKernel.outputTexture) + public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) { + + guard let inQueue = queue, let inDevice = device else { + fatalError( " queue or devcie nil " ) + } + + guard let buffer = inQueue.makeCommandBuffer() else { + fatalError( " make buffer error" ) + } + + let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: 
net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3), metalLoadMode: net.metalLoadMode, metalLibPath: net.metalLibPath) + + do { + try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer) + } catch let error { + print(error) + fatalError() + } + + buffer.addCompletedHandler { (buffer) in + complete(scaleKernel.outputTexture) + } + buffer.commit() } - buffer.commit() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift index 12bc909be97b372ce9f82daf035dced0b969cdc7..64786d0a45fde417021fc468e5526076ca760753 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Extensions.swift @@ -16,128 +16,128 @@ import Foundation // 自定义 ?! 如果 ?! 前的返回值为一个可选值, 则进行隐式解包, 如果有值则返回这个值, 如果为nil 则fatalError 传入的信息 precedencegroup ExecutedOrFatalError{ - associativity: left - higherThan: AssignmentPrecedence + associativity: left + higherThan: AssignmentPrecedence } infix operator ?!: ExecutedOrFatalError public func ?!(option: T?, excuteOrError: @autoclosure () -> String) -> T{ - if let inOpt = option { - return inOpt - }else{ - print(excuteOrError()) - fatalError(excuteOrError()) - } + if let inOpt = option { + return inOpt + }else{ + print(excuteOrError()) + fatalError(excuteOrError()) + } } //Lense struct Lense { - let from: (A) -> B - let to: (B, A) -> A + let from: (A) -> B + let to: (B, A) -> A } precedencegroup CombineLense{ - associativity: left - higherThan: AssignmentPrecedence + associativity: left + higherThan: AssignmentPrecedence } infix operator >>>: CombineLense func >>>(left: Lense, right: Lense) -> Lense { - return Lense.init(from: { (a) -> C in - left.from(right.from(a)) - }, to: { (c, a) -> A in - right.to( left.to(c, right.from(a)),a) - }) + return Lense.init(from: { (a) -> C in + left.from(right.from(a)) + }, to: { (c, a) -> A in + right.to( left.to(c, right.from(a)),a) + }) } protocol CIntIndex { - associatedtype T; - subscript(index: CInt) -> T { get set}; + associatedtype T; + subscript(index: CInt) -> T { get set}; } extension Array: CIntIndex{ - typealias T = Element - subscript(index: CInt) -> T { - get{ - guard Int64(Int.max) >= Int64(index) else{ - fatalError("cint index out of Int range") - } - return self[Int(index)] - } - set{ - guard Int64(Int.max) >= Int64(index) else{ - fatalError("cint index out of Int range") - } - self[Int(index)] = newValue + typealias T = Element + subscript(index: CInt) -> T { + get{ + guard Int64(Int.max) >= Int64(index) else{ + fatalError("cint index out of Int range") + } + return self[Int(index)] + } + set{ + guard Int64(Int.max) >= Int64(index) else{ + fatalError("cint index out of Int range") + } + self[Int(index)] = newValue + } + } - - } } extension Array where Element: AnyObject{ - mutating func remove(element: Element) { - if let index = index(where: { (node) -> Bool in - return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self) - }) { - remove(at: index) + mutating func remove(element: Element) { + if let index = index(where: { (node) -> Bool in + return unsafeBitCast(element, to: Int.self) == unsafeBitCast(node, to: Int.self) + }) { + remove(at: index) + } } - } - + } //MARK: Array extension extension Array where Element: Comparable{ - - /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回 - /// - /// - Parameter r: 前 r 个元素 - /// - Returns: [(原有位置, 排好位置的元素)] - public func top(r: Int) -> [(Int, Element)] { - precondition(r <= self.count) - return Array<(Int, 
Element)>(zip(0.. $1.1 }.prefix(through: r - 1)) - } + + /// 返回数组前 r 个元素, 并将元素处于原数组的位置作为元组的第一个元素返回 + /// + /// - Parameter r: 前 r 个元素 + /// - Returns: [(原有位置, 排好位置的元素)] + public func top(r: Int) -> [(Int, Element)] { + precondition(r <= self.count) + return Array<(Int, Element)>(zip(0.. $1.1 }.prefix(through: r - 1)) + } } extension Array { - public func strideArray(inCount: Int = 20) -> [(Int, Element)] { - if count < inCount { - return (0.. [(Int, Element)] { + if count < inCount { + return (0.., count: Int) -> [Float32] { - var arr: [Float32] = [] - for i in 0.., count: Int) -> [Float32] { + var arr: [Float32] = [] + for i in 0.. [Pointee]{ - var arr: [Pointee] = [] - for i in 0.. [Pointee]{ + var arr: [Pointee] = [] + for i in 0.. UnsafePointer? { - return (self as NSString).utf8String - } + func cStr() -> UnsafePointer? { + return (self as NSString).utf8String + } } func address(o: T) -> String { - return String.init(format: "%018p", unsafeBitCast(o, to: Int.self)) + return String.init(format: "%018p", unsafeBitCast(o, to: Int.self)) } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift index c3ba777b2740fd806be450b0e373fb37a95249e6..3af3a75a64fa64b5cc482c24925c762cec38a925 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift @@ -21,615 +21,615 @@ fileprivate var paddleMobileMetalLibrary: MTLLibrary? fileprivate var customMetalLibrary: MTLLibrary? extension MTLDevice { - func defaultLibrary() -> MTLLibrary { - if defaultMetalLibrary == nil { - defaultMetalLibrary = makeDefaultLibrary() - } - if let inDefaultLib = defaultMetalLibrary { - return inDefaultLib - } else { - fatalError(" default metal libary is nil") - } - } - - func customLibrary(metalLibPath: String) -> MTLLibrary { - if customMetalLibrary == nil { - do { - customMetalLibrary = try makeLibrary(filepath: metalLibPath) - } catch let error { - fatalError("\(error)") - } + func defaultLibrary() -> MTLLibrary { + if defaultMetalLibrary == nil { + defaultMetalLibrary = makeDefaultLibrary() + } + if let inDefaultLib = defaultMetalLibrary { + return inDefaultLib + } else { + fatalError(" default metal libary is nil") + } } - if let inMetalLib = customMetalLibrary { - return inMetalLib - } else { - fatalError(" customlib is nil ") - } - } - - func paddleMobileLibrary() -> MTLLibrary { - if paddleMobileMetalLibrary == nil { - guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else { - fatalError("Counld't find paddle mobile library") - } - do { - paddleMobileMetalLibrary = try makeLibrary(filepath: path) - } catch _ { - fatalError("Counld't load paddle mobile library") - } + func customLibrary(metalLibPath: String) -> MTLLibrary { + if customMetalLibrary == nil { + do { + customMetalLibrary = try makeLibrary(filepath: metalLibPath) + } catch let error { + fatalError("\(error)") + } + } + + if let inMetalLib = customMetalLibrary { + return inMetalLib + } else { + fatalError(" customlib is nil ") + } } - if let inPaddleMobileLib = paddleMobileMetalLibrary { - return inPaddleMobileLib - } else { - fatalError("PaddleMobile metal libary is nil") - } - } - - func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) 
-> MTLComputePipelineState { - let useLib: MTLLibrary - switch metalLoadMode { - case .LoadMetalInDefaultLib: - useLib = defaultLibrary() - case .LoadMetalInPaddleMobile: - useLib = paddleMobileLibrary() - case .LoadMetalInCustomMetalLib: - useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ") - default: - fatalError() + func paddleMobileLibrary() -> MTLLibrary { + if paddleMobileMetalLibrary == nil { + guard let path = Bundle.init(for: Kernel.self).path(forResource: "default", ofType: "metallib") else { + fatalError("Counld't find paddle mobile library") + } + do { + paddleMobileMetalLibrary = try makeLibrary(filepath: path) + } catch _ { + fatalError("Counld't load paddle mobile library") + } + } + + if let inPaddleMobileLib = paddleMobileMetalLibrary { + return inPaddleMobileLib + } else { + fatalError("PaddleMobile metal libary is nil") + } } - guard let function = useLib.makeFunction(name: funcName) else { - fatalError(" function " + funcName + " not found") - } - do { - let pipLine = try makeComputePipelineState(function: function) - return pipLine - } catch let error { - print(error) - fatalError("make pip line error occured : \(error)") + func pipeLine(funcName: String, metalLoadMode: MetalLoadMode, metalLibPath: String?) -> MTLComputePipelineState { + let useLib: MTLLibrary + switch metalLoadMode { + case .LoadMetalInDefaultLib: + useLib = defaultLibrary() + case .LoadMetalInPaddleMobile: + useLib = paddleMobileLibrary() + case .LoadMetalInCustomMetalLib: + useLib = customLibrary(metalLibPath: metalLibPath ?! " can't be nil ") + default: + fatalError() + } + + guard let function = useLib.makeFunction(name: funcName) else { + fatalError(" function " + funcName + " not found") + } + do { + let pipLine = try makeComputePipelineState(function: function) + return pipLine + } catch let error { + print(error) + fatalError("make pip line error occured : \(error)") + } + } - } - - func makeBuffer
<P>
(value: [P]) -> MTLBuffer { - let buffer = makeBuffer(length: value.count * MemoryLayout
<P>
.size, options: MTLResourceOptions.storageModeShared) - let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout
<P>
.size) - for i in 0..(texture: MTLTexture, cb: ([Int], P)->Void) -> Void { - let bpR = texture.width * 4 * MemoryLayout
<P>
.size - let bpI = texture.height * bpR - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1)) - for i in 0.. = UnsafeMutablePointer
<P>
.allocate(capacity: bpI) - texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i) - for tx in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(value: [P]) -> MTLBuffer { + let buffer = makeBuffer(length: value.count * MemoryLayout
<P>
.size, options: MTLResourceOptions.storageModeShared) + let contents = buffer?.contents().bindMemory(to: P.self, capacity: value.count * MemoryLayout
<P>
.size) + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(texture: MTLTexture, cb: ([Int], P)->Void) -> Void { + let bpR = texture.width * 4 * MemoryLayout
<P>
.size + let bpI = texture.height * bpR + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: 1)) + for i in 0.. = UnsafeMutablePointer
<P>
.allocate(capacity: bpI) + texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i) + for tx in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { - if dim.count == 3 { - return texture2tensor_3(texture: texture, dim: dim, transpose: transpose) - } else if dim.count == 2 { - return texture2tensor_2(texture: texture, dim: dim, transpose: transpose) - } else if dim.count == 1 { - return texture2tensor_1(texture: texture, dim: dim, transpose: transpose) - } - var tdim: [Int] = [1, 1, 1, 1] - for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture { - if value.count > 0 { - assert(value.count == dim.reduce(1) { $0 * $1 }) + func texture2tensor_2
<P>
(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + var tdim: [Int] = [1, 1, 1, 1] + for i in 0..(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] { + if dim.count == 3 { + return texture2tensor_3(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 2 { + return texture2tensor_2(texture: texture, dim: dim, transpose: transpose) + } else if dim.count == 1 { + return texture2tensor_1(texture: texture, dim: dim, transpose: transpose) + } + var tdim: [Int] = [1, 1, 1, 1] + for i in 0.. 0 { - var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 - rcount = rcount * 4 * ndim[1] * ndim[2] - var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) - - for i0 in 0.. = UnsafeMutablePointer(mutating: nvalue) - let outputP: UnsafeMutablePointer = UnsafeMutablePointer(mutating: xvalue) - float32ToFloat16(input: pointer, output: outputP, count: rcount) - let bpR = ndim[2] * 4 * 2 - let bpI = ndim[1] * bpR - for i in 0.. = UnsafeMutablePointer(mutating: nvalue) - let bpR = ndim[2] * 4 * MemoryLayout
<P>
.size - let bpI = ndim[1] * bpR - for i in 0..(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: Precision = .Float32) -> MTLTexture { + if value.count > 0 { + assert(value.count == dim.reduce(1) { $0 * $1 }) + } + + var tdim: [Int] = [1, 1, 1, 1] + for i in 0.. 0 { + var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 + rcount = rcount * 4 * ndim[1] * ndim[2] + var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) + + for i0 in 0.. = UnsafeMutablePointer(mutating: nvalue) + let outputP: UnsafeMutablePointer = UnsafeMutablePointer(mutating: xvalue) + float32ToFloat16(input: pointer, output: outputP, count: rcount) + let bpR = ndim[2] * 4 * 2 + let bpI = ndim[1] * bpR + for i in 0.. = UnsafeMutablePointer(mutating: nvalue) + let bpR = ndim[2] * 4 * MemoryLayout
<P>
.size + let bpI = ndim[1] * bpR + for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ - - let textureDesc = MTLTextureDescriptor.init() - textureDesc.width = textureWidth - textureDesc.height = textureHeight - textureDesc.depth = 1 - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float - textureDesc.textureType = .type2DArray - textureDesc.storageMode = .shared - textureDesc.cpuCacheMode = .defaultCache - textureDesc.arrayLength = arrayLength - let texture = makeTexture(descriptor: textureDesc)! - if value.count >= 4{ - let counts = arrayLength * 4 * textureWidth * textureHeight - let pointer: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: counts * MemoryLayout
<P>
.size) - for i in 0...size - let bytesPerImage = texture.height * bytesPerRow - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) - for i in 0..(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{ + + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc)! + + if value.count >= 4{ + let counts = arrayLength * 4 * textureWidth * textureHeight + let pointer: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: counts * MemoryLayout
<P>
.size) + for i in 0...size + let bytesPerImage = texture.height * bytesPerRow + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth)) + for i in 0..(stridable: Bool = true) -> [(index: Int, value: P)] { - var arr: [P] = floatArray { (p: P) -> P in - return p; - } - var result: [(index: Int, value: P)] = [] - if arr.count > 100 && stridable { - for j in stride(from: 0, to: arr.count , by: arr.count / 100){ - result.append((j, arr[j])) - } - } else { - for j in 0..(res: (P) -> T) -> [T] { - var fArr: [T] = [] - if textureType == .type2DArray { - for i in 0...size, alignment: MemoryLayout
<P>
.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size - let bytesPerImage = width * height * depth * 4 * MemoryLayout
<P>
.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) - let p = bytes.assumingMemoryBound(to: P.self) - - for j in 0...size, alignment: MemoryLayout
<P>
.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) - let p = bytes.assumingMemoryBound(to: P.self) - - for j in 0..(stridable: Bool = true) -> [(index: Int, value: P)] { + var arr: [P] = floatArray { (p: P) -> P in + return p; + } + var result: [(index: Int, value: P)] = [] + if arr.count > 100 && stridable { + for j in stride(from: 0, to: arr.count , by: arr.count / 100){ + result.append((j, arr[j])) + } + } else { + for j in 0.. [Float32] { - if pixelFormat == .rgba32Float { - let float32Array = floatArray { (f: Float32) -> Float32 in - return f - } - return float32Array - } else if pixelFormat == .rgba16Float { - - var float16Array = floatArray { (f: Float16) -> Float16 in - return f - } - return float16To32(input: &float16Array, count: float16Array.count) - } else { - fatalError() + + func floatArray(res: (P) -> T) -> [T] { + var fArr: [T] = [] + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout
<P>
.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout
<P>
.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0...size, alignment: MemoryLayout
<P>
.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout
<P>
.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: P.self) + + for j in 0..(header: String = "", stridable: Bool = true) -> T? { - print(header) - print("texture: \(self)") - // let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) - // print(res) - if textureType == .type2DArray { - for i in 0...size, alignment: MemoryLayout.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout.size - let bytesPerImage = width * height * depth * 4 * MemoryLayout.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) - let p = bytes.assumingMemoryBound(to: T.self) - str += "2d array count : \(width * height * depth * 4) \n" - if stridable && width * height * depth * 4 > 20 { - for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){ - str += " index \(j): \(p[j])" - } + func float32Array() -> [Float32] { + if pixelFormat == .rgba32Float { + let float32Array = floatArray { (f: Float32) -> Float32 in + return f + } + return float32Array + } else if pixelFormat == .rgba16Float { + + var float16Array = floatArray { (f: Float16) -> Float16 in + return f + } + return float16To32(input: &float16Array, count: float16Array.count) } else { - for j in 0...size, alignment: MemoryLayout.alignment) - let bytesPerRow = width * depth * 4 * MemoryLayout.size - let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) - getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) - let p = bytes.assumingMemoryBound(to: T.self) - str += "2d count : \(width * width * 4) \n" - - if stridable { - for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){ - str += "index \(j): \(p[j]) " - } - } else { - for j in 0.. [Float32] { - var textureArray: [Float32] - if pixelFormat == .rgba32Float { - textureArray = floatArray { (i : Float32) -> Float32 in - return i - } - } else if pixelFormat == .rgba16Float { - - var textureFloat16Array = floatArray { (i : Float16) -> Float16 in - return i - } - textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) - } else { - fatalError(" 目前还不支持其他类型 ") - } - print(textureArray.count) - var output: [Float32] = [] - for s in 0..(header: String = "", stridable: Bool = true) -> T? 
{ + print(header) + print("texture: \(self)") + // let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable) + // print(res) + + if textureType == .type2DArray { + for i in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let bytesPerImage = width * height * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d array count : \(width * height * depth * 4) \n" + if stridable && width * height * depth * 4 > 20 { + for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 20){ + str += " index \(j): \(p[j])" + } + } else { + for j in 0...size, alignment: MemoryLayout.alignment) + let bytesPerRow = width * depth * 4 * MemoryLayout.size + let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth)) + getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0) + let p = bytes.assumingMemoryBound(to: T.self) + str += "2d count : \(width * width * 4) \n" + + if stridable { + for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 20){ + str += "index \(j): \(p[j]) " + } + } else { + for j in 0.. [Float32] { -// print("origin dim: \(dim)") -// print("texture: ") -// print(self) - var textureArray: [Float32] - if pixelFormat == .rgba32Float { - textureArray = floatArray { (i : Float32) -> Float32 in - return i - } - } else if pixelFormat == .rgba16Float { - var textureFloat16Array = floatArray { (i : Float16) -> Float16 in - return i - } - textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) - } else { - fatalError(" 目前还不支持其他类型 ") + // n c h w - dim + func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] { + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + print(textureArray.count) + var output: [Float32] = [] + for s in 0.. dim.c { - for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) { - let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] - output.append(value) - } - } else { - for i in 0..<4 { - let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] - output.append(value) - } - } - } - } + func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] { + // print("origin dim: \(dim)") + // print("texture: ") + // print(self) + + var textureArray: [Float32] + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") + } + + var output: [Float32] = [] + let numOfASlice = dim.h * dim.w * 4 + for h in 0.. 
dim.c { + for i in 0..<(4 - ((sliceIndex * 4 + 4) - dim.c)) { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) + } + } else { + for i in 0..<4 { + let value = textureArray[sliceIndex * numOfASlice + h * dim.w * 4 + w * 4 + i] + output.append(value) + } + } + } + } + } + return output } - return output - } - + } public extension MTLBuffer { - func logDesc(header: String = "", stridable: Bool = true) -> T? { - print(header) - print("MTLBuffer: \(self) ") - var str = "" - if stridable && length/MemoryLayout.stride > 1000{ - for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ - str += " \(contents().assumingMemoryBound(to: T.self)[j])" - } - } else { - for i in 0...size { - str += " \(contents().assumingMemoryBound(to: T.self)[i])" - } + func logDesc(header: String = "", stridable: Bool = true) -> T? { + print(header) + print("MTLBuffer: \(self) ") + var str = "" + if stridable && length/MemoryLayout.stride > 1000{ + for j in stride(from: 0, to: length, by: length/MemoryLayout.stride / 100){ + str += " \(contents().assumingMemoryBound(to: T.self)[j])" + } + } else { + for i in 0...size { + str += " \(contents().assumingMemoryBound(to: T.self)[i])" + } + } + print(str) + return nil } - print(str) - return nil - } - - func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.width = textureWidth - textureDesc.height = textureHeight - textureDesc.depth = 1 - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.pixelFormat = .rgba32Float - textureDesc.textureType = .type2DArray - textureDesc.storageMode = .shared - textureDesc.cpuCacheMode = .defaultCache - textureDesc.arrayLength = arrayLength - let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! - return texture - } - - func array() -> [T] { - var array: [T] = [] - let pointer = contents().bindMemory(to: T.self, capacity: length) - for i in 0..<(length / MemoryLayout.size) { - array.append(pointer[i]) + + func makeTexture(textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.width = textureWidth + textureDesc.height = textureHeight + textureDesc.depth = 1 + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.pixelFormat = .rgba32Float + textureDesc.textureType = .type2DArray + textureDesc.storageMode = .shared + textureDesc.cpuCacheMode = .defaultCache + textureDesc.arrayLength = arrayLength + let texture = makeTexture(descriptor: textureDesc, offset: 0, bytesPerRow: textureWidth * 4 * 4)! + return texture + } + + func array() -> [T] { + var array: [T] = [] + let pointer = contents().bindMemory(to: T.self, capacity: length) + for i in 0..<(length / MemoryLayout.size) { + array.append(pointer[i]) + } + return array; } - return array; - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift index 724a44b0f4c9dcce189bc32abadb1675e01e8e72..52c27cceade8267aaeb5edee26db521419f1cf94 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift @@ -89,135 +89,135 @@ public class PaddleMobileUnitTest { } public func testConcat() { -// let buffer = queue.makeCommandBuffer() ?! 
"buffer is nil" -// var it: [[Float32]] = [] -// for _ in 0..<7 { -// it.append((0..<12).map { Float32($0) }) -// } -// let input = it.map { device.tensor2texture(value: $0, dim: [3, 4]) } -// let output = device.tensor2texture(value: [Float32](), dim: [3, 28]) -// -// let param = ConcatTestParam.init( -// input: input, -// output: output, -// dims: [[3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4], [3, 4]], -// axis: 1, -// odim: [3, 28] -// ) -// let concatKernel = ConcatKernel.init(device: device, testParam: param) -// concatKernel.test(cmdBuffer: buffer, param: param) -// buffer.addCompletedHandler { (buffer) in -// for i in 0...init(device: device, testParam: param) + // concatKernel.test(cmdBuffer: buffer, param: param) + // buffer.addCompletedHandler { (buffer) in + // for i in 0...init(device: device, testParam: param) -// reshapeKernel.test(commandBuffer: buffer, testParam: param) -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inTexture.logDesc() -// let _: Float32? = outTexture.logDesc() -// self.tensorPrint(tensor: input, dim: [2, 3, 4]) -// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6]) -// self.tensorPrint(tensor: tx, dim: [4, 6]) -// } + // let buffer = queue.makeCommandBuffer() ?! "buffer is nil" + // let input: [Float32] = (0..<24).map { Float32($0) } + // let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) + // let outTexture = device.tensor2texture(value: [Float32](), dim: [4, 6]) + // let mp = ReshapeMetalParam.init( + // idim: (1, 2, 3, 4), + // itrans: (0, 1, 2, 3), + // odim: (1, 1, 4, 6), + // otrans: (0, 1, 2, 3) + // ) + // let param = ReshapeTestParam.init( + // inputTexture: inTexture, + // outputTexture: outTexture, + // param: mp + // ) + // let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) + // reshapeKernel.test(commandBuffer: buffer, testParam: param) + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inTexture.logDesc() + // let _: Float32? = outTexture.logDesc() + // self.tensorPrint(tensor: input, dim: [2, 3, 4]) + // let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [4, 6]) + // self.tensorPrint(tensor: tx, dim: [4, 6]) + // } -// let input: [Float32] = (0..<24).map { Float32($0) } -// let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) -// let outTexture = device.tensor2texture(value: [Float32](), dim: [24]) -// let mp = ReshapeMetalParam.init( -// idim: (1, 2, 3, 4), -// itrans: (0, 1, 2, 3), -// odim: (1, 1, 1, 24), -// otrans: (0, 1, 2, 3) -// ) -// let param = ReshapeTestParam.init( -// inputTexture: inTexture, -// outputTexture: outTexture, -// param: mp -// ) -// let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) -// reshapeKernel.test(commandBuffer: buffer, testParam: param) -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inTexture.logDesc() -// let _: Float32? 
= outTexture.logDesc() -// self.tensorPrint(tensor: input, dim: [2, 3, 4]) -// let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24]) -// self.tensorPrint(tensor: tx, dim: [24]) -// } -// -// -// buffer.commit() + // let input: [Float32] = (0..<24).map { Float32($0) } + // let inTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) + // let outTexture = device.tensor2texture(value: [Float32](), dim: [24]) + // let mp = ReshapeMetalParam.init( + // idim: (1, 2, 3, 4), + // itrans: (0, 1, 2, 3), + // odim: (1, 1, 1, 24), + // otrans: (0, 1, 2, 3) + // ) + // let param = ReshapeTestParam.init( + // inputTexture: inTexture, + // outputTexture: outTexture, + // param: mp + // ) + // let reshapeKernel = ReshapeKernel.init(device: device, testParam: param) + // reshapeKernel.test(commandBuffer: buffer, testParam: param) + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inTexture.logDesc() + // let _: Float32? = outTexture.logDesc() + // self.tensorPrint(tensor: input, dim: [2, 3, 4]) + // let tx: [Float32] = self.device.texture2tensor(texture: outTexture, dim: [24]) + // self.tensorPrint(tensor: tx, dim: [24]) + // } + // + // + // buffer.commit() } public func testTranspose() { - + let buffer = queue.makeCommandBuffer() ?! "buffer is nil" -// var input: [Float32] = [] -// for i in 0..<72 { -// input.append(Float32(i)) -// } -//// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3) -// let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]); -// // group 1 -// let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4]) -// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0]) -//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1]) -//// // group 2 -//// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6) -//// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1]) -//// -// let transposeKernel = TransposeKernel.init(device: device, testParam: param) -// -// transposeKernel.test(commandBuffer: buffer, param: param) -// -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) -// let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) -// self.tensorPrint(tensor: input, dim: [4, 3, 2, 3]) -// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4]) -// self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4]) -// } -// -// let input: [Float32] = (0..<24).map { Float32($0) } -// let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) -// let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2]) -// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1]) -// let transposeKernel = TransposeKernel.init(device: device, testParam: param) -// -// transposeKernel.test(commandBuffer: buffer, param: param) -// -// buffer.addCompletedHandler { (buffer) in -// let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) -// let _: Float32? 
= outputTexture.logDesc(header: "output texture", stridable: false) -// self.tensorPrint(tensor: input, dim: [2, 3, 4]) -// let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2]) -// self.tensorPrint(tensor: tx, dim: [3, 4, 2]) -// } -// + // var input: [Float32] = [] + // for i in 0..<72 { + // input.append(Float32(i)) + // } + //// let inputTexture = device.makeFloatTexture(value: input, textureWidth: 3, textureHeight: 2, arrayLength: 3) + // let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3]); + // // group 1 + // let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4]) + // let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 3, oC: 4, axis: [3, 1, 2, 0]) + //// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [3, 0, 2, 1]) + //// // group 2 + //// let outputTexture = device.makeFloatTexture(value: [Float32](), textureWidth: 3, textureHeight: 3, arrayLength: 6) + //// let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 4, axis: [3, 0, 2, 1]) + //// + // let transposeKernel = TransposeKernel.init(device: device, testParam: param) + // + // transposeKernel.test(commandBuffer: buffer, param: param) + // + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) + // let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) + // self.tensorPrint(tensor: input, dim: [4, 3, 2, 3]) + // let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4]) + // self.tensorPrint(tensor: tx, dim: [3, 3, 2, 4]) + // } + // + // let input: [Float32] = (0..<24).map { Float32($0) } + // let inputTexture = device.tensor2texture(value: input, dim: [2, 3, 4]) + // let outputTexture = device.tensor2texture(value: [Float](), dim: [3, 4, 2]) + // let param = TransposeTestParam.init(inputTexture: inputTexture, outputTexture: outputTexture, iC: 4, oC: 2, axis: [0, 2, 3, 1]) + // let transposeKernel = TransposeKernel.init(device: device, testParam: param) + // + // transposeKernel.test(commandBuffer: buffer, param: param) + // + // buffer.addCompletedHandler { (buffer) in + // let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false) + // let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false) + // self.tensorPrint(tensor: input, dim: [2, 3, 4]) + // let tx: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 4, 2]) + // self.tensorPrint(tensor: tx, dim: [3, 4, 2]) + // } + // buffer.commit() } @@ -225,72 +225,72 @@ public class PaddleMobileUnitTest { let buffer = queue.makeCommandBuffer() ?! 
" buffer is nil " let input: [Float32] = [ - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - 1.0, 2.0, 3.0, 4.0, - ] + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + 1.0, 2.0, 3.0, 4.0, + ] let filter: [Float32] = [ - //1.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - //2.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - //3.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - //4.0 - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, - ] + //1.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //2.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //3.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + //4.0 + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, + ] let biase: [Float32] = [1.0, 1.0, 1.0, 100.0] let newScalue: [Float32] = [1.0, 1.0, 1.0, 1.0] @@ -324,10 +324,10 @@ public class PaddleMobileUnitTest { let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) - let initContext = InitContext.init() - initContext.metalLoadMode = .LoadMetalInDefaultLib + let initContext = InitContext.init() + initContext.metalLoadMode = .LoadMetalInDefaultLib - let convAddBnReluKernel = ConvAddBatchNormReluKernel.init(device: device, testParam: param, initContext: initContext) + let convAddBnReluKernel = ConvAddBatchNormReluKernel.init(device: device, testParam: param, initContext: initContext) convAddBnReluKernel.test(commandBuffer: buffer, param: param) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Tools.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Tools.swift index 23ad7113971de3d0843abe17accfe3d67f0caaa9..6128aa87768aaefddb782cf29ce0056a67284b37 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/Tools.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Tools.swift @@ -14,7 +14,7 @@ import Foundation -func writeToLibrary(fileName: String, array: [P]) { +func 
writeToLibrary(fileName: String, array: [P]) { let libraryPath = NSSearchPathForDirectoriesInDomains(.libraryDirectory, .userDomainMask, true).last ?! " library path get error " let filePath = libraryPath + "/" + fileName let fileManager = FileManager.init() diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift index ae7b898a8ecedefc21f0dce36a845fb024786246..21a9f4385d7205c02414634e042c1e958d1ab120 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/Types.swift @@ -16,222 +16,165 @@ import Foundation import Accelerate public protocol SummableMultipliable: Equatable { - static func +(lhs: Self, rhs: Self) -> Self - static func *(lhs: Self, rhs: Self) -> Self - static func -(lhs: Self, rhs: Self) -> Self + static func +(lhs: Self, rhs: Self) -> Self + static func *(lhs: Self, rhs: Self) -> Self + static func -(lhs: Self, rhs: Self) -> Self } -public protocol PrecisionType: SummableMultipliable{ - init(inFloat: Float32) - init(inFloat16: Float16) - init(_ inP: P) - static var bitSize: UInt { get } + +public protocol PrecisionProtocol: SummableMultipliable{ + init(inFloat: Float32) + init(inFloat16: Float16) + init(_ inP: P) + static var bitSize: UInt { get } + static func initializeValue() -> Self + static var precisionType: Precision { get } } public typealias Float16 = Int16 -extension Float16: PrecisionType { - public static func * (prefix: Float16, postfix: Float16) { - return prefix * postfix - } - - public init
<P>
(_ inP: P) where P : PrecisionType { - if P.bitSize == Float32.bitSize { - self = Float16(inFloat: inP as! Float32) - } else if P.bitSize == Float16.bitSize { - self = inP as! Float16 - } else { - fatalError() +extension Float16: PrecisionProtocol { + + public static var precisionType: Precision { + return .Float16 + } + + public static func initializeValue() -> Int16 { + return 0 + } + + public init
<P>
(_ inP: P) where P : PrecisionProtocol { + if P.bitSize == Float32.bitSize { + self = Float16(inFloat: inP as! Float32) + } else if P.bitSize == Float16.bitSize { + self = inP as! Float16 + } else { + fatalError() + } + } + + public static var bitSize: UInt { + return 16 + } + + public init(inFloat16: Float16) { + self = inFloat16 + } + public init(inFloat: Float32) { + self = Int16(inFloat) } - } - - public static var bitSize: UInt { - return 16 - } - - public init(inFloat16: Float16) { - self = inFloat16 - } - public init(inFloat: Float32) { - self = Int16(inFloat) - } } -extension Float32: PrecisionType { - public init
<P>
(_ inP: P) where P : PrecisionType { - if P.bitSize == Float32.bitSize { - self = inP as! Float32 - } else if P.bitSize == Float16.bitSize { - self = Float32.init(inP as! Float16) - } else { - fatalError() +extension Float32: PrecisionProtocol { + + public static var precisionType: Precision { + return .Float32 + } + + public static func initializeValue() -> Float { + return 0.0 + } + + public init
<P>
(_ inP: P) where P : PrecisionProtocol { + if P.bitSize == Float32.bitSize { + self = inP as! Float32 + } else if P.bitSize == Float16.bitSize { + self = Float32.init(inP as! Float16) + } else { + fatalError() + } + } + + public init(inFloat: Float32) { + self = inFloat + } + + public init(inFloat16: Float16) { + self = Float32.init(inFloat16) + } + + public static var bitSize: UInt { + return 32 } - } - - public init(inFloat: Float32) { - self = inFloat - } - - public init(inFloat16: Float16) { - self = Float32.init(inFloat16) - } - - public static var bitSize: UInt { - return 32 - } } public func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { - var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) - var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) - guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { - fatalError(" float 32 to float 16 error ! ") - } + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } } public func float16To32(input: UnsafeMutablePointer, count: Int) -> [Float32] { - var output = Array.init(repeating: 0.0, count: count) - float16to32(input: input, output: &output, count: count) - return output + var output = Array.init(repeating: 0.0, count: count) + float16to32(input: input, output: &output, count: count) + return output } public func float16to32(input: UnsafeMutablePointer, output: UnsafeMutablePointer, count: Int) { - var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) - var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) - if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { - fatalError(" convert float16 to float32 error") - } + var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) + var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) + if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { + fatalError(" convert float16 to float32 error") + } } // N - 0 C - 1 H - 2 W - 3 struct DataLayout { - - static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { - return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])]) - } - - static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { - return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])]) - } - - func count() -> Int { - return layoutWithDim.count - } - - var N: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .N { - return layoutDim.1 - } - } - return nil - } - set { - var newN = (Layout.N, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .N - }) { - fatalError() - } - } - } - var C: Int? 
{ - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .C { - return layoutDim.1 - } - } - return nil + + static func NCHW(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.C, dim[1]), (.H, dim[2]), (.W, dim[3])]) } - set { - var newN = (Layout.C, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .N - }) { - fatalError() - } + + static func NHWC(dim: Dim = Dim.init(inDim: [0, 0, 0, 0])) -> DataLayout { + return DataLayout.init([(.N, dim[0]), (.H, dim[1]), (.W, dim[2]), (.C, dim[3])]) } - } - var H: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .H { - return layoutDim.1 - } - } - return nil + + func count() -> Int { + return layoutWithDim.count } - set { - var newN = (Layout.H, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .H - }) { - fatalError() - } + + init(_ inLayout: [(Layout, Int)]) { + layoutWithDim = inLayout } - } - var W: Int? { - get { - for layoutDim in layoutWithDim { - if layoutDim.0 == .W { - return layoutDim.1 - } - } - return nil + + func layout() -> [Layout] { + return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in + return layout + }) } - set { - var newN = (Layout.W, newValue) - if let index = layoutWithDim.index(where: { (layout: Layout, dim: Int) -> Bool in - return layout == .W - }) { - fatalError() - } + + var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)] + + func convertTo(inLayout: [Layout]) { + } - } - - - init(_ inLayout: [(Layout, Int)]) { - layoutWithDim = inLayout - } - - func layout() -> [Layout] { - return layoutWithDim.map({ (layout: Layout, dim: Int) -> Layout in - return layout - }) - } - - var layoutWithDim: [(Layout, Int)] = [(.N, 0), (.C, 0), (.H, 0), (.W, 0)] - - func convertTo(inLayout: [Layout]) { - } - - enum Layout: Int{ - case N = 0 - case C = 1 - case H = 2 - case W = 3 - static func defaultLayout() -> [Layout] { - return [N, C, H, W] + enum Layout: Int{ + case N = 0 + case C = 1 + case H = 2 + case W = 3 + static func defaultLayout() -> [Layout] { + return [N, C, H, W] + } } - } } extension DataLayout: Equatable { - public static func == (lhs: DataLayout, rhs: DataLayout) -> Bool { - if lhs.layoutWithDim.count == rhs.layoutWithDim.count { - var result = true - for i in 0.. Bool { + if lhs.layoutWithDim.count == rhs.layoutWithDim.count { + var result = true + for i in 0.. { - guard let inResultBuffer = resultBuffer else { - fatalError() + var resultBuffer: MTLBuffer? 
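    // Illustrative sketch, not part of this diff: FetchHolder owns the raw fetch result.
    // initBuffer(device:) below allocates `resultBuffer` with paddedCapacity * 4 bytes
    // (one Float32 per padded element) and `result` re-binds its contents as
    // UnsafeMutablePointer<Float32>. Assuming an already-created MTLDevice named `device`,
    // a caller could copy the fetched values into a Swift array roughly like this:
    //
    //   let holder = FetchHolder.init(inPaddedCapacity: 16, inDim: Dim.init(inDim: [1, 2, 2, 3]))
    //   holder.initBuffer(device: device)
    //   let values = Array(UnsafeBufferPointer.init(start: holder.result, count: holder.capacity))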
+ public var dim: Dim + public var capacity: Int + public var paddedCapacity: Int + + init(inPaddedCapacity: Int, inDim: Dim) { + paddedCapacity = inPaddedCapacity + capacity = inDim.numel() + dim = inDim + } + + public func initBuffer(device: MTLDevice) { + resultBuffer = device.makeBuffer(length: paddedCapacity * 4, options: []) } - return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity) - } - + + var result: UnsafeMutablePointer { + guard let inResultBuffer = resultBuffer else { + fatalError() + } + return inResultBuffer.contents().bindMemory(to: Float32.self, capacity: paddedCapacity) + } + } extension FetchHolder: CustomStringConvertible, CustomDebugStringConvertible { - public var description: String { - fatalError() -// return "\(result)" - } - - public var debugDescription: String { - fatalError() -// return "\(result)" - } - - + public var description: String { + fatalError() + // return "\(result)" + } + + public var debugDescription: String { + fatalError() + // return "\(result)" + } + + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift index 1817184bf7d5ef7ca9cbe6e9fd829aa14b564dc6..77b67bf16ca248c2e3d9bac525c5ee8d64d67255 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Dim.swift @@ -15,41 +15,41 @@ import Foundation @objc public class Dim: NSObject { - private(set) var dims: [Int] - - @objc public init(inDim: [Int]) { - dims = inDim - } - - public func cout() -> Int { - return dims.count - } - - public func numel() -> Int { - return dims.reduce(1) { $0 * $1 } - } - - public static func ==(left: Dim, right: Dim) -> Bool { - return left.dims == right.dims; - } - - public static func !=(left: Dim, right: Dim) -> Bool { - return left.dims != right.dims; - } - - public subscript(index: Int) -> Int { - return dims[index]; - } - - public override var description: String { - return "\(dims)" - } - - func swapeDimAt(index1: Int, index2: Int) { - dims.swapAt(index1, index2) - } - - private override init(){ - fatalError() - } + private(set) var dims: [Int] + + @objc public init(inDim: [Int]) { + dims = inDim + } + + public func cout() -> Int { + return dims.count + } + + public func numel() -> Int { + return dims.reduce(1) { $0 * $1 } + } + + public static func ==(left: Dim, right: Dim) -> Bool { + return left.dims == right.dims; + } + + public static func !=(left: Dim, right: Dim) -> Bool { + return left.dims != right.dims; + } + + public subscript(index: Int) -> Int { + return dims[index]; + } + + public override var description: String { + return "\(dims)" + } + + func swapeDimAt(index1: Int, index2: Int) { + dims.swapAt(index1, index2) + } + + private override init(){ + fatalError() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift index ec29df04e72794709da1cce18f7f8e13da6b2af8..28c1f6dddcc8a1664e741ae2301ff998c8df434a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift @@ -14,136 +14,143 @@ import Foundation - let testTo = 5 var isTest = false @objc public class GPUResultHolder: NSObject{ - @objc public let dim: [Int] - @objc public let capacity: Int - @objc public var resultPointer: UnsafeMutablePointer? - @objc public var intermediateResults: [String : [MTLBuffer]]? 
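// Illustrative sketch, not part of this diff: GPUResultHolder copies the final fetch
// output into its own resultPointer (see the init below), so the values remain valid
// after the Metal completion handler returns. Assuming `holder` is a GPUResultHolder
// produced by Executor.predict, the values can be read back like this:
//
//   func resultArray(_ holder: GPUResultHolder) -> [Float32] {
//       guard let pointer = holder.resultPointer else { return [] }
//       return Array(UnsafeBufferPointer.init(start: pointer, count: holder.capacity))
//   }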
- public init(inDim: [Int], inPointer: UnsafeMutablePointer?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) { - dim = inDim - capacity = inCapacity + @objc public let dim: [Int] + @objc public let capacity: Int + @objc public var resultPointer: UnsafeMutablePointer? + @objc public var intermediateResults: [String : [MTLBuffer]]? + public init(inDim: [Int], inPointer: UnsafeMutablePointer?, inCapacity: Int, inIntermediateResults: [String : [MTLBuffer]]? = nil) { + dim = inDim + capacity = inCapacity + + if let inInPointer = inPointer { + resultPointer = UnsafeMutablePointer.allocate(capacity: inCapacity) + resultPointer?.initialize(from: inInPointer, count: inCapacity) + } + + intermediateResults = inIntermediateResults + } - if let inInPointer = inPointer { - resultPointer = UnsafeMutablePointer.allocate(capacity: inCapacity) - resultPointer?.initialize(from: inInPointer, count: inCapacity) + public override var description: String { + fatalError() } - intermediateResults = inIntermediateResults - } - - public override var description: String { - fatalError() - } - } -public class Executor { - var ops: [Runable & InferShaperable] = [] - var preInputDim: Dim = Dim.init(inDim: []) - let program: Program - let device: MTLDevice - let inflightSemaphore: DispatchSemaphore - let queue: MTLCommandQueue - init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws { - self.inflightSemaphore = DispatchSemaphore(value: 1) - program = inProgram - device = inDevice - queue = inQueue - - for block in inProgram.programDesc.blocks { - //block.ops.count - for i in 0...shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext) - ops.append(op) - } catch let error { - throw error - } - } - } - } - - public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping (GPUResultHolder) -> Void, preProcessKernle: CusomKernel? 
= nil, except: Int = 0) throws { - inflightSemaphore.wait() +protocol Executorable { + func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel?, except: Int) throws + func clear() +} - guard let buffer = queue.makeCommandBuffer() else { - throw PaddleMobileError.predictError(message: "CommandBuffer is nil") - } - - let resInput: MTLTexture - if let inPre = preProcessKernle { - do { - try inPre.compute(inputTexuture: input, commandBuffer: buffer) - resInput = inPre.outputTexture - } catch let error { - throw error - } - } else { - resInput = input - } - - let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim) - program.scope.setInput(input: inputTexture) - //(ops.count - except) - for i in 0..<(ops.count - except) { - let op = ops[i] - do { - try op.run(device: device, buffer: buffer) - } catch let error { - throw error - } +public class Executor: Executorable{ + var ops: [Runable & InferShaperable] = [] + var preInputDim: Dim = Dim.init(inDim: []) + let program: Program + let device: MTLDevice + let inflightSemaphore: DispatchSemaphore + let queue: MTLCommandQueue + init(inDevice:MTLDevice, inQueue: MTLCommandQueue, inProgram: Program, initContext: InitContext) throws { + self.inflightSemaphore = DispatchSemaphore(value: 1) + program = inProgram + device = inDevice + queue = inQueue + + for block in inProgram.programDesc.blocks { + //block.ops.count + for i in 0...shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope, initContext: initContext) + ops.append(op) + } catch let error { + throw error + } + } + } } - var outputTextures: [String : [MTLBuffer]]? - if except > 0 { - ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer) - outputTextures = ops[ops.count - except].inputVariant() + public func predict(input: MTLTexture, dim: Dim, completionHandle: @escaping ([GPUResultHolder]) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { + inflightSemaphore.wait() + + guard let buffer = queue.makeCommandBuffer() else { + throw PaddleMobileError.predictError(message: "CommandBuffer is nil") + } + + let resInput: MTLTexture + if let inPre = preProcessKernle { + do { + try inPre.compute(inputTexuture: input, commandBuffer: buffer) + resInput = inPre.outputTexture + } catch let error { + throw error + } + } else { + resInput = input + } + + let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim) + program.scope.setInput(input: inputTexture) + //(ops.count - except) + for i in 0..<(ops.count - except) { + let op = ops[i] + do { + try op.run(device: device, buffer: buffer) + } catch let error { + throw error + } + } + + var outputTextures: [String : [MTLBuffer]]? + if except > 0 { + ops[ops.count - except].computeMiddleResult(device: device, buffer: buffer) + outputTextures = ops[ops.count - except].inputVariant() + } + + buffer.addCompletedHandler { [weak self] (commandbuffer) in + guard let SSelf = self else { + fatalError() + } + + //将输入写进文件 + /* + let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) + print(dim) + writeToLibrary(fileName: "mobilenet_input", array: inputArr) + print(" write done ") + return + */ + + //输出 op 计算结果 + if GlobalConfig.shared.debug { + for i in 0.. 0 { + resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inIntermediateResults: outputTextures) + } else { + let outputVar: Variant = SSelf.program.scope.output()! 
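                // Illustrative note, not part of this diff: with this change completionHandle
                // now receives [GPUResultHolder] rather than a single holder. Assuming an
                // already-built Executor named `executor`, an input MTLTexture `inputTexture`
                // and its Dim `inputDim` (all hypothetical here), a caller of the new API
                // might look like:
                //
                //   try executor.predict(input: inputTexture, dim: inputDim, completionHandle: { holders in
                //       if let first = holders.first, let pointer = first.resultPointer {
                //           let output = Array(UnsafeBufferPointer.init(start: pointer, count: first.capacity))
                //           print(output.prefix(10))
                //       }
                //   })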
+ let output: FetchHolder = outputVar as! FetchHolder + resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity) + } + + completionHandle([resultHolder]) + SSelf.inflightSemaphore.signal() + } + + buffer.commit() } - buffer.addCompletedHandler { [weak self] (commandbuffer) in - guard let SSelf = self else { - fatalError() - } - - //将输入写进文件 - /* - let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) - print(dim) - writeToLibrary(fileName: "test_image_super", array: inputArr) - print(" write done ") - return - */ - - /* 输出 op 计算结果 - for op in SSelf.ops { - op.delogOutput() - } - */ - - var resultHolder: GPUResultHolder - if except > 0 { - resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inIntermediateResults: outputTextures) - } else { - let outputVar: Variant = SSelf.program.scope.output()! - let output: FetchHolder = outputVar as! FetchHolder - resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity) - } - - completionHandle(resultHolder) - SSelf.inflightSemaphore.signal() + public func clear() { + program.scope.clear() } - buffer.commit() - } - - public func clear() { - program.scope.clear() - } - } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift index 1d4f0ec14fa6442be708e729ce841969a12f5582..c437d284ccff5fd76d4bda26bc011083a8aeb998 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift @@ -15,252 +15,257 @@ import Foundation //import SwiftProtobuf -public class Loader { - class ParaLoader { - let file: UnsafeMutablePointer - let fileSize: Int - var nowIndex: Int - init(paramPath: String) throws { - guard let tmpFile = fopen(paramPath, "rb") else { - throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) - } - file = tmpFile - fseek(file, 0, SEEK_END) - fileSize = ftell(file) - guard fileSize > 0 else { - throw PaddleMobileError.loaderError(message: "param file size is too small") - } - rewind(file) - nowIndex = 0 - } - - func read(tensor: Tensor
<P>
) throws { - guard nowIndex <= fileSize else { - throw PaddleMobileError.loaderError(message: "out of the file range") - } - - func pointerReader(type: T.Type) -> T { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) - fread(ptr, 1, MemoryLayout.size, file) - nowIndex += MemoryLayout.size - let pointee = ptr.pointee - ptr.deinitialize(count: MemoryLayout.size) - ptr.deallocate() - return pointee - } - - let _ = pointerReader(type: UInt32.self) - let lodLevel = pointerReader(type: UInt64.self) - for _ in 0...size)){ - _ = pointerReader(type: size_t.self) - } - } - - let _ = pointerReader(type: UInt32.self) - - let tensorDescSize = pointerReader(type: Int32.self) - - fseek(file, Int(tensorDescSize), SEEK_CUR) - nowIndex += Int(tensorDescSize) - - /* - 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 - */ - - //现在模型传入模型为 Float 类型, 这块应该根据模型来 - // let tmpCapacity = MemoryLayout.size * tensor.numel() - // let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); - let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) - - guard bytesRead == tensor.data.size else { - throw PaddleMobileError.loaderError(message: "param read size error") - } - - // TODO: use script to convert - // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) - // for i in 0..) throws { - guard nowIndex <= paramSize else { - throw PaddleMobileError.loaderError(message: "out of the file range") - } - var readerIndex: Int = 0 - func pointerReader(type: T.Type) -> T { - let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) - memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout.size) - nowIndex += MemoryLayout.size - readerIndex += MemoryLayout.size - let pointee = ptr.pointee - ptr.deinitialize(count: MemoryLayout.size) - ptr.deallocate() - - return pointee - } - let _ = pointerReader(type: UInt32.self) - let lodLevel = pointerReader(type: UInt64.self) - for _ in 0...size)){ - _ = pointerReader(type: size_t.self) - } +protocol Loaderable { + func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program + func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program +} + +public class Loader: Loaderable{ + class ParaLoader { + let file: UnsafeMutablePointer + let fileSize: Int + var nowIndex: Int + init(paramPath: String) throws { + guard let tmpFile = fopen(paramPath, "rb") else { + throw PaddleMobileError.loaderError(message: "open param file error" + paramPath) + } + file = tmpFile + fseek(file, 0, SEEK_END) + fileSize = ftell(file) + guard fileSize > 0 else { + throw PaddleMobileError.loaderError(message: "param file size is too small") + } + rewind(file) + nowIndex = 0 } - let _ = pointerReader(type: UInt32.self) - let tensorDescSize = pointerReader(type: Int32.self) - - paramPointer = paramPointer.advanced(by: Int(readerIndex)) - paramPointer = paramPointer.advanced(by: Int(tensorDescSize)) - nowIndex += Int(tensorDescSize) + func read(tensor: Tensor
<P>
) throws { + guard nowIndex <= fileSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") + } + + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + fread(ptr, 1, MemoryLayout.size, file) + nowIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() + return pointee + } + + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) + } + } + + let _ = pointerReader(type: UInt32.self) + + let tensorDescSize = pointerReader(type: Int32.self) + + fseek(file, Int(tensorDescSize), SEEK_CUR) + nowIndex += Int(tensorDescSize) + + /* + 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 + */ + + //现在模型传入模型为 Float 类型, 这块应该根据模型来 + // let tmpCapacity = MemoryLayout.size * tensor.numel() + // let tmpPointer = UnsafeMutablePointer.allocate(capacity: tmpCapacity); + let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file) + + guard bytesRead == tensor.data.size else { + throw PaddleMobileError.loaderError(message: "param read size error") + } + + // TODO: use script to convert + // let bytesRead = fread(tmpPointer, 1, tmpCapacity, file) + // for i in 0.. Program { - do { - /// swift protobuf serialized Data to instance class - // let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( - // serializedData: modelData) + class ParaLoaderWithPointer { + var paramPointer: UnsafeMutableRawPointer + let paramSize: Int + var nowIndex: Int + init(pPointer: UnsafeMutableRawPointer,pSize:Int) throws { + paramPointer = UnsafeMutableRawPointer.init(pPointer) + paramSize = pSize + nowIndex = 0 + } - /// oc protobuf serialized Data to instance class - let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data) - - let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram) - let programDesc = ProgramOptimize
<P>
.init().optimize(originProgramDesc: originProgramDesc) - -// let programDesc = PMProgramDesc.init(protoProgram: protoProgram) - - print(programDesc) - - guard programDesc.blocks.count > 0 else { - throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") - } - - // to get feed key and fetch key - let block = programDesc.blocks[0] - guard let firstOp = block.ops.first, let lastOp = block.ops.last else { - throw PaddleMobileError.loaderError(message: "at least two operator") - } - - guard firstOp.type == gFeedType, lastOp.type == gFetchType else { - throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") - } - - guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { - throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") - } - guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { - throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") - } - - let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) - - // to load memory - for block in programDesc.blocks { - for varDesc in block.vars { - if (varDesc.type == .LodTensor) { - guard let tensorDesc = varDesc.tensorDesc else { - throw PaddleMobileError.loaderError(message: "get tensor desc failed") + func read(tensor: Tensor
<P>
) throws { + guard nowIndex <= paramSize else { + throw PaddleMobileError.loaderError(message: "out of the file range") } - - if (varDesc.persistable - && varDesc.type != .FeedMiniBatch - && varDesc.type != .FetchList) { - let dimArr = tensorDesc.dims - - guard dimArr.count > 0 else { - throw PaddleMobileError.loaderError(message: "tensor desc dim size error") - } - - let dim = Dim.init(inDim: dimArr) - let tensor = Tensor
<P>
.init(inDim: dim, inLayout: tensorDesc.dataLayout) - do { - if paraLoaderPointer != nil { - try paraLoaderPointer!.read(tensor: tensor) - } + var readerIndex: Int = 0 + func pointerReader(type: T.Type) -> T { + let ptr = UnsafeMutablePointer.allocate(capacity: MemoryLayout.size) + memcpy(ptr, paramPointer.advanced(by: Int(readerIndex)), MemoryLayout.size) + nowIndex += MemoryLayout.size + readerIndex += MemoryLayout.size + let pointee = ptr.pointee + ptr.deinitialize(count: MemoryLayout.size) + ptr.deallocate() - if paraLoader != nil { - try paraLoader!.read(tensor: tensor) - } - } catch let error { - throw error - } - // tensor.convert(to: DataLayout.NHWC()) - // tensor.initBuffer(device: device) - scope[varDesc.name] = tensor - } else { - let dim = Dim.init(inDim: tensorDesc.dims) - scope[varDesc.name] = Texture.init(device: device, inDim: dim) + return pointee } - } else { - if varDesc.name == fetchKey { -// scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0) - } else if varDesc.name == feedKey { + let _ = pointerReader(type: UInt32.self) + let lodLevel = pointerReader(type: UInt64.self) + for _ in 0...size)){ + _ = pointerReader(type: size_t.self) + } } - } + + let _ = pointerReader(type: UInt32.self) + let tensorDescSize = pointerReader(type: Int32.self) + + paramPointer = paramPointer.advanced(by: Int(readerIndex)) + paramPointer = paramPointer.advanced(by: Int(tensorDescSize)) + nowIndex += Int(tensorDescSize) + + let _ = memcpy(tensor.data.pointer, paramPointer, tensor.data.size) + paramPointer = paramPointer.advanced(by: Int(tensor.data.size)) + nowIndex += tensor.data.size + } + deinit { } - } - - let program = Program.init(inProgramDesc: programDesc, inScope: scope) - - return program - } catch _ { - throw PaddleMobileError.loaderError(message: "protobuf decoder error") - } - } - public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program { - let modelData = Data.init(bytes:modePointer, count:modelSize) - guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else { - throw PaddleMobileError.loaderError(message: "load para error") - } - do { - let program = try loadModelandParam(device,modelData,paraLoader,nil) - return program - } catch let error { - throw error } - } - - public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{ - guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { - throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + public init(){} + private func loadModelandParam(_ device:MTLDevice,_ modelData:Data, _ paraLoaderPointer:ParaLoaderWithPointer?, _ paraLoader:ParaLoader?) throws -> Program { + do { + /// swift protobuf serialized Data to instance class + // let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init( + // serializedData: modelData) + + /// oc protobuf serialized Data to instance class + let protoProgram = try ProgramDesc.init(data: (modelData as NSData) as Data) + + let originProgramDesc = PMProgramDesc.init(protoProgram: protoProgram) + let programDesc = ProgramOptimize
<P>
.init().optimize(originProgramDesc: originProgramDesc) + + // let programDesc = PMProgramDesc.init(protoProgram: protoProgram) + + print(programDesc) + + guard programDesc.blocks.count > 0 else { + throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0") + } + + // to get feed key and fetch key + let block = programDesc.blocks[0] + guard let firstOp = block.ops.first, let lastOp = block.ops.last else { + throw PaddleMobileError.loaderError(message: "at least two operator") + } + + guard firstOp.type == gFeedType, lastOp.type == gFetchType else { + throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch") + } + + guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else { + throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found") + } + guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else { + throw PaddleMobileError.loaderError(message: "feed key or fetch key not found") + } + + let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey) + + // to load memory + for block in programDesc.blocks { + for varDesc in block.vars { + if (varDesc.type == .LodTensor) { + guard let tensorDesc = varDesc.tensorDesc else { + throw PaddleMobileError.loaderError(message: "get tensor desc failed") + } + + if (varDesc.persistable + && varDesc.type != .FeedMiniBatch + && varDesc.type != .FetchList) { + let dimArr = tensorDesc.dims + + guard dimArr.count > 0 else { + throw PaddleMobileError.loaderError(message: "tensor desc dim size error") + } + + let dim = Dim.init(inDim: dimArr) + let tensor = Tensor
<P>
.init(inDim: dim, inLayout: tensorDesc.dataLayout) + do { + if paraLoaderPointer != nil { + try paraLoaderPointer!.read(tensor: tensor) + } + + if paraLoader != nil { + try paraLoader!.read(tensor: tensor) + } + } catch let error { + throw error + } + // tensor.convert(to: DataLayout.NHWC()) + // tensor.initBuffer(device: device) + scope[varDesc.name] = tensor + } else { + let dim = Dim.init(inDim: tensorDesc.dims) + scope[varDesc.name] = Texture.init(device: device, inDim: dim) + } + } else { + if varDesc.name == fetchKey { + // scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inCapacity: <#Int#>, inElapsedTime: 0.0) + } else if varDesc.name == feedKey { + } + } + } + } + + let program = Program.init(inProgramDesc: programDesc, inScope: scope) + + return program + } catch _ { + throw PaddleMobileError.loaderError(message: "protobuf decoder error") + } } - guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { - throw PaddleMobileError.loaderError(message: "load para error") + public func load(device:MTLDevice, paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) throws -> Program { + let modelData = Data.init(bytes:modePointer, count:modelSize) + guard let paraLoader = try? ParaLoaderWithPointer.init(pPointer: paramPointer,pSize: paramSize) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + do { + let program = try loadModelandParam(device,modelData,paraLoader,nil) + return program + } catch let error { + throw error + } } - do { - let program = try loadModelandParam(device,modelData,nil,paraLoader) - return program - } catch let error { - throw error + public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program { + guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else { + throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !") + } + guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else { + throw PaddleMobileError.loaderError(message: "load para error") + } + + do { + let program = try loadModelandParam(device,modelData,nil,paraLoader) + return program + } catch let error { + throw error + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift index 97fe0a8fbadf443a5b71ce150c37c4b023af65c9..b1121875c8071f6f5e16ce1c0be94bcba6b4627e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift @@ -17,337 +17,440 @@ import MetalKit import CoreMedia protocol Tensorial: Variant { - var dim: Dim { get set } - func numel() -> Int - var layout: DataLayout { get } + var dim: Dim { get set } + func numel() -> Int + var layout: DataLayout { get } } extension Tensorial { - func numel() -> Int { - return dim.numel() - } + func numel() -> Int { + return dim.numel() + } } - - -class Tensor: Tensorial { - - var data: Data - var dim: Dim - var buffer: MTLBuffer! - private(set) var layout: DataLayout - - class Data { - init(inSize: Int, inPointer: UnsafeMutablePointer
<P>
) { - size = inSize - pointer = inPointer - } - let size: Int - var pointer: UnsafeMutablePointer
<P>
- subscript(index: Int) -> P{ - get { - return pointer[index] - } - set { - pointer[index] = newValue - } - } - func release() { - pointer.deinitialize(count: size) - pointer.deallocate() - } - deinit { - // release() - } - } - - init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { - dim = inDim - let size = inDim.numel() * MemoryLayout
<P>
.size - let pointer = UnsafeMutablePointer
<P>
.allocate(capacity: size) - data = Data.init(inSize: size, inPointer: pointer) - layout = inLayout - } - - func convert(to: DataLayout) { - guard to != layout else { - return +class DataConverter { + func convert(from: UnsafeMutablePointer
<P>
, to: UnsafeMutablePointer
<P>
, fromDim: Dim) { + fatalError(" need imp") } - guard dim.cout() == 4 else { - return + func getToDim(fromDim: Dim, layout: DataLayout) -> (dim: Dim, layout: DataLayout) { + fatalError(" need imp") } +} + +/// [ outputChannels ][ inputChannels ][ kernelHeight ][ kernelWidth ] -> +/// [ outputChannels ][ kernelHeight ][ kernelWidth ][ inputChannels ] +class MPSPointerConverter: DataConverter
<P>
{ - guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else { - // other not support - return + /// [ outputChannels ][ inputChannels ][ kernelHeight ][ kernelWidth ] -> + /// [ outputChannels ][ kernelHeight ][ kernelWidth ][ inputChannels ] + /// - Parameters: + /// - from: from pointer + /// - to: to pointer + override func convert(from: UnsafeMutablePointer
<P>
, to: UnsafeMutablePointer
<P>
, fromDim: Dim) { + let outputChannels = fromDim[0] + let inputChannels = fromDim[1] + let kernelHeight = fromDim[2] + let kernelWidth = fromDim[3] + + for outChannel in 0...allocate(capacity: data.size) - if layout == DataLayout.NCHW() { - NCHW2NHWC(newPtr: newPointer) + override func getToDim(fromDim: Dim, layout: DataLayout) -> (dim: Dim, layout: DataLayout) { + + if layout != DataLayout.NCHW() { + fatalError("not support") + } + + let outputChannels = fromDim[0] + let inputChannels = fromDim[1] + let kernelHeight = fromDim[2] + let kernelWidth = fromDim[3] + let toDim = Dim.init(inDim: [outputChannels, kernelHeight, kernelWidth, inputChannels]) + + return (dim: toDim, layout: DataLayout.NHWC()) } - - data.release() - data.pointer = newPointer - layout = to - } - +} - - func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) { - if convertToNHWC { -// print(layout) - convert(to: DataLayout.NHWC()) - } +class Tensor: Tensorial { + + var data: Data + var dim: Dim - if withTranspose { - let transposePointer = UnsafeMutablePointer
<P>
.allocate(capacity: numel()) - let n = dim[0] - let hwc = numel()/n - for j in 0..) { + count = inCount + size = inCount * MemoryLayout
<P>
.size + pointer = inPointer + } + internal private(set) var pointer: UnsafeMutablePointer
<P>
+ subscript(index: Int) -> P { + get { + return pointer[index] + } + set { + pointer[index] = newValue + } + } + func release() { + if !released { + pointer.deinitialize(count: count) + pointer.deallocate() + released = true + } + } + + deinit { + if !released { + pointer.deinitialize(count: count) + pointer.deallocate() + released = true + } } - } - - dim.swapeDimAt(index1: 0, index2: 3) - data.release() - data.pointer = transposePointer } - guard let floatPointer = data.pointer as? UnsafeMutablePointer else { - fatalError(" not support yet ") + init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { + tensorDim = inDim + dim = inDim + let pointer = UnsafeMutablePointer
<P>
.allocate(capacity: inDim.numel()) + data = Data.init(inCount: inDim.numel(), inPointer: pointer) + layout = inLayout } - let precisionSize: Int - switch precision { - case .Float32: - precisionSize = 4 - case .Float16: - precisionSize = 2 + func convert(converter: DataConverter
<P>
) -> UnsafeMutablePointer
<P>
{ + let to = UnsafeMutablePointer
<P>
.allocate(capacity: numel()) + converter.convert(from: data.pointer, to: to, fromDim: dim) + data = Data.init(inCount: numel(), inPointer: to) + let dimAndLayout = converter.getToDim(fromDim: dim, layout: layout) + dim = dimAndLayout.dim + layout = dimAndLayout.layout + return to } - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - let C = dim[3] - let cSlices = (C + 3) / 4 - let paddedC = cSlices * 4 - let count = paddedC * dim[0] * dim[1] * dim[2] - if C == paddedC { - buffer = device.makeBuffer(length: count * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout
<P>
.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) - } - } else if C == 1 && !padWhenOneC { - buffer = device.makeBuffer(length: numel() * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout
<P>
.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel()) - } - } else { - buffer = device.makeBuffer(length: count * precisionSize) - let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) - var tmpPointer = floatPointer - var dstPtr = convertedPointer - for _ in 0...allocate(capacity: numel()) + + if layout == DataLayout.NCHW() { + NCHW2NHWC(newPtr: newPointer) + } + + data.release() + data = Data.init(inCount: data.count, inPointer: newPointer) + layout = to + } + + func initBuffer(device: MTLDevice, precision computePrecision: Precision = .Float16, padWhenOneC: Bool = false, convertToNHWC: Bool = true, withTranspose: Bool = false) { + if convertToNHWC { + convert(to: DataLayout.NHWC()) + } + + if P.precisionType == .Float16 && computePrecision == .Float32{ + fatalError(" 不支持: 16位模型不能按照 32 位进行运算") + } + + if withTranspose { + let transposePointer = UnsafeMutablePointer

.allocate(capacity: numel()) + let n = dim[0] + let hwc = numel()/n + for j in 0...stride) - case .Float16: - float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) - } - - convertedPointer.deinitialize(count: count) - convertedPointer.deallocate() + + dim.swapeDimAt(index1: 0, index2: 3) + data.release() + data = Data.init(inCount: data.count, inPointer: transposePointer) } - } else { - let C = dim[3] - let cSlices = (C + 3) / 4 - let paddedC = cSlices * 4 - let count = paddedC * dim[0] * dim[1] * dim[2] - if C == paddedC { - buffer = device.makeBuffer(length: count * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) - } - } else if C == 1 { - fatalError(" not support ") - } else { - buffer = device.makeBuffer(length: count * precisionSize) - let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) - var tmpPointer = floatPointer - var dstPtr = convertedPointer - for _ in 0...stride) + case .Float32: + switch computePrecision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: data.pointer as! UnsafeMutablePointer, output: buffer.contents(), count: count) + } + } + } else if C == 1 && !padWhenOneC { + buffer = device.makeBuffer(length: numel() * precisionSize) + switch P.precisionType { + case .Float16: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + case .Float32: + switch computePrecision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: data.pointer as! UnsafeMutablePointer, output: buffer.contents(), count: numel()) + } + } + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer

.allocate(capacity: count) + var tmpPointer = data.pointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float32: + switch computePrecision { + case .Float32: + buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: convertedPointer as! UnsafeMutablePointer, output: buffer.contents(), count: count) + } + } + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() + } + } else { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch P.precisionType { + case .Float16: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float32: + switch computePrecision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: data.pointer as! UnsafeMutablePointer, output: buffer.contents(), count: count) + } + } + } else if C == 1 { + fatalError(" not support ") + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer

.allocate(capacity: count) + var tmpPointer = data.pointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float32: // model precision is 32-bit + switch computePrecision { + case .Float32: + buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: convertedPointer as! UnsafeMutablePointer, output: buffer.contents(), count: count) + } + } + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() + } } - tmpPointer += C - dstPtr += paddedC - } - - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) - } - convertedPointer.deinitialize(count: count) - convertedPointer.deallocate() + } else if dim.cout() == 1 { + let num = ((numel() + 3) / 4) * 4 + buffer = device.makeBuffer(length: num * precisionSize) + + switch P.precisionType { + case .Float16: + buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout

.stride) + case .Float32: + switch computePrecision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: data.pointer as! UnsafeMutablePointer, output: buffer.contents(), count: num) + } + } + } else { + fatalError(" not support !") } - } - } else if dim.cout() == 1 { - let num = ((numel() + 3) / 4) * 4 - buffer = device.makeBuffer(length: num * precisionSize) - switch precision { - case .Float32: - buffer?.contents().copyMemory(from: data.pointer, byteCount: num * MemoryLayout

.stride) - case .Float16: - float32ToFloat16(input: floatPointer, output: buffer.contents(), count: num) - } - } else { - fatalError(" not support !") + //TODO: release + data.release() } - //TODO: release - data.release() - } - - var n: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NCHW() { - return dim[0] - } else if layout == DataLayout.NHWC() { - return dim[0] - } else { - fatalError(" unsupport ") + + var n: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NCHW() { + return dim[0] + } else if layout == DataLayout.NHWC() { + return dim[0] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - var width: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - return dim[2] - } else if layout == DataLayout.NCHW() { - return dim[3] - } else { - fatalError(" unsupport ") + + var width: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + return dim[2] + } else if layout == DataLayout.NCHW() { + return dim[3] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - var height: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - return dim[1] - } else if layout == DataLayout.NCHW() { - return dim[2] - } else { - fatalError(" unsupport ") + + var height: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + return dim[1] + } else if layout == DataLayout.NCHW() { + return dim[2] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - var channel: Int { - get { - if dim.cout() == 4 { - if layout == DataLayout.NHWC() { - return dim[3] - } else if layout == DataLayout.NCHW() { - return dim[1] - } else { - fatalError(" unsupport ") + + var channel: Int { + get { + if dim.cout() == 4 { + if layout == DataLayout.NHWC() { + return dim[3] + } else if layout == DataLayout.NCHW() { + return dim[1] + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } } - } else { - fatalError() - } } - } - - - func NCHW2NHWC(newPtr: UnsafeMutablePointer

) { - let N = dim[0] - let C = dim[1] - let H = dim[2] - let W = dim[3] - let HXW = H * W - let CXHXW = C * H * W - var index: Int = 0 - for n in 0..) { + let N = dim[0] + let C = dim[1] + let H = dim[2] + let W = dim[3] + let HXW = H * W + let CXHXW = C * H * W + + var index: Int = 0 + for n in 0...size { - str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" + + var debugDescription: String { + var str = "dim: \(dim) \n" + str += "MTLBuffer: \(self.buffer.description) \n" + for i in 0...size { + str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])" + } + return str } - return str - } - - func logDataPointer(header: String = "") { - print(header) - var str = "" - str += "data size: \(data.size) \n" - str += "dim: \(dim) \n" - for i in 0.. [1, 1, a, b] transpose 必须为 [0, 1, x, x] -// [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3] -// [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x] - -3 维 tensor [a, b, c] 对应的 texture_2darray, -.width = c -.height = b -.len = a + 3 / 4 + // TODO transpose 对于低维 tensor 的扩展原则。。。 + // [a, b] -> [1, 1, a, b] transpose 必须为 [0, 1, x, x] + // [a] -> [1, 1, 1, a] transpose 必须为 [0, 1, 2, 3] + // [a, b, c] -> [1, a, b, c] tranpose 必须为 [0, x, x, x] + + 3 维 tensor [a, b, c] 对应的 texture_2darray, + .width = c + .height = b + .len = a + 3 / 4 2 维 tensor [a, b] 对应的 texture_2darray .width = b + 3 / 4 @@ -69,136 +69,136 @@ extension InputTexture { .len = 1 */ public class Texture: Tensorial { - public var dim: Dim - public var tensorDim: Dim - - /// tensor dim pad to four - public var padToFourDim: Dim - private var textureDesc: MTLTextureDescriptor! - public var metalTexture: MTLTexture! - var transpose: [Int] = [0, 1, 2, 3] - - func elementCount() -> Int { - return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4 - } - - func toTensor() -> [Float32] { - guard padToFourDim.cout() == 4 else { - fatalError("- not support -") - } - return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) - } - - func realNHWC() -> [Float32] { - guard padToFourDim.cout() == 4 else { - fatalError(" - not support - ") - } - return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - } - - public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { - transpose = inTranspose - for i in 0..<(4 - tensorDim.cout()) { - if i != inTranspose[i] { - fatalError() - } - } + public var dim: Dim + public var tensorDim: Dim - let newDim = transpose.map { padToFourDim[$0] } - let newLayout = transpose.map { layout.layoutWithDim[$0] } + /// tensor dim pad to four + public var padToFourDim: Dim + private var textureDesc: MTLTextureDescriptor! + public var metalTexture: MTLTexture! 
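// A minimal sketch of the shape-to-texture mapping that initTexture performs in this file; the
// helper name textureExtents is illustrative only and not part of paddle_mobile. The padded,
// transposed four-dim shape is laid out as a texture2d_array with channels packed four per RGBA
// texel, which is why every arrayLength below rounds up by (x + 3) / 4.
func textureExtents(paddedDim: [Int], tensorRank: Int) -> (width: Int, height: Int, arrayLength: Int) {
    precondition(paddedDim.count == 4, "expects a shape already padded to four dims and transposed")
    switch tensorRank {
    case 4:
        // [n, h, w, c]: width = w, height = h, array slices cover n * c channels
        return (paddedDim[2], paddedDim[1], (paddedDim[0] * paddedDim[3] + 3) / 4)
    case 3:
        return (paddedDim[3], paddedDim[2], (paddedDim[1] + 3) / 4)
    case 2, 1:
        return ((paddedDim[3] + 3) / 4, paddedDim[2], 1)
    default:
        fatalError("unreachable")
    }
}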
+ var transpose: [Int] = [0, 1, 2, 3] - layout = DataLayout.init(newLayout) - dim = Dim.init(inDim: newDim) + func elementCount() -> Int { + return metalTexture.width * metalTexture.height * metalTexture.arrayLength * 4 + } - let tmpTextureDes = MTLTextureDescriptor.init() - tmpTextureDes.textureType = .type2DArray - tmpTextureDes.depth = 1 + func toTensor() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError("- not support -") + } + return metalTexture.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2])) + } - switch tensorDim.cout() { - case 4: - tmpTextureDes.width = newDim[2] - tmpTextureDes.height = newDim[1] - tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4 - case 3: - tmpTextureDes.width = newDim[3] - tmpTextureDes.height = newDim[2] - tmpTextureDes.arrayLength = (newDim[1] + 3) / 4 - case 2, 1: - tmpTextureDes.width = (newDim[3] + 3) / 4 - tmpTextureDes.height = newDim[2] - tmpTextureDes.arrayLength = 1 - default: - fatalError("unreachable") + func realNHWC() -> [Float32] { + guard padToFourDim.cout() == 4 else { + fatalError(" - not support - ") + } + return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) } - - if computePrecision == .Float16 { - tmpTextureDes.pixelFormat = .rgba16Float - } else if computePrecision == .Float32 { - tmpTextureDes.pixelFormat = .rgba32Float + + public func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: Precision = .Float16) { + transpose = inTranspose + for i in 0..<(4 - tensorDim.cout()) { + if i != inTranspose[i] { + fatalError() + } + } + + let newDim = transpose.map { padToFourDim[$0] } + let newLayout = transpose.map { layout.layoutWithDim[$0] } + + layout = DataLayout.init(newLayout) + dim = Dim.init(inDim: newDim) + + let tmpTextureDes = MTLTextureDescriptor.init() + tmpTextureDes.textureType = .type2DArray + tmpTextureDes.depth = 1 + + switch tensorDim.cout() { + case 4: + tmpTextureDes.width = newDim[2] + tmpTextureDes.height = newDim[1] + tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4 + case 3: + tmpTextureDes.width = newDim[3] + tmpTextureDes.height = newDim[2] + tmpTextureDes.arrayLength = (newDim[1] + 3) / 4 + case 2, 1: + tmpTextureDes.width = (newDim[3] + 3) / 4 + tmpTextureDes.height = newDim[2] + tmpTextureDes.arrayLength = 1 + default: + fatalError("unreachable") + } + + if computePrecision == .Float16 { + tmpTextureDes.pixelFormat = .rgba16Float + } else if computePrecision == .Float32 { + tmpTextureDes.pixelFormat = .rgba32Float + } + + tmpTextureDes.usage = [.shaderRead, .shaderWrite] + tmpTextureDes.storageMode = .shared + textureDesc = tmpTextureDes + metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! " texture nil " } - tmpTextureDes.usage = [.shaderRead, .shaderWrite] - tmpTextureDes.storageMode = .shared - textureDesc = tmpTextureDes - metalTexture = device.makeTexture(descriptor: tmpTextureDes) ?! 
" texture nil " - } - - public func updateDims(inTensorDim: Dim, inDim: Dim) { - var fourDim: Dim - if inDim.cout() == 4 { - fourDim = inDim - } else if inDim.cout() < 4 { - var fourDimNum: [Int] = [] - for _ in 0..<(4 - inDim.cout()) { - fourDimNum.append(1) - } - fourDimNum.append(contentsOf: inDim.dims) - fourDim = Dim.init(inDim: fourDimNum) - } else { - fatalError(" not support ") + public func updateDims(inTensorDim: Dim, inDim: Dim) { + var fourDim: Dim + if inDim.cout() == 4 { + fourDim = inDim + } else if inDim.cout() < 4 { + var fourDimNum: [Int] = [] + for _ in 0..<(4 - inDim.cout()) { + fourDimNum.append(1) + } + fourDimNum.append(contentsOf: inDim.dims) + fourDim = Dim.init(inDim: fourDimNum) + } else { + fatalError(" not support ") + } + + tensorDim = inTensorDim + dim = fourDim + padToFourDim = fourDim } - tensorDim = inTensorDim - dim = fourDim - padToFourDim = fourDim - } - - // 初始化时 dim padToFourDim 模型中的维度(一般来说 nchw),前面补全0 - init(device: MTLDevice, inDim: Dim) { - print(" in dim > \(inDim)") - var fourDim: Dim - if inDim.cout() == 4 { - fourDim = inDim - } else if inDim.cout() < 4 { - var fourDimNum: [Int] = [] - for _ in 0..<(4 - inDim.cout()) { - fourDimNum.append(1) - } - fourDimNum.append(contentsOf: inDim.dims) - fourDim = Dim.init(inDim: fourDimNum) - } else { - fatalError(" not support ") + // 初始化时 dim padToFourDim 模型中的维度(一般来说 nchw),前面补全0 + init(device: MTLDevice, inDim: Dim) { + print(" in dim > \(inDim)") + var fourDim: Dim + if inDim.cout() == 4 { + fourDim = inDim + } else if inDim.cout() < 4 { + var fourDimNum: [Int] = [] + for _ in 0..<(4 - inDim.cout()) { + fourDimNum.append(1) + } + fourDimNum.append(contentsOf: inDim.dims) + fourDim = Dim.init(inDim: fourDimNum) + } else { + fatalError(" not support ") + } + tensorDim = inDim + dim = fourDim + padToFourDim = fourDim + layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) } - tensorDim = inDim - dim = fourDim - padToFourDim = fourDim - layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) - } - - private(set) var layout: DataLayout + + private(set) var layout: DataLayout } extension Texture { - public var description: String { - return debugDescription - } - - public var debugDescription: String{ - var str = "" - str += "Dim: \(dim) \n value:[ " - str += "\(metalTexture)" - str += " ]" - return str - } - + public var description: String { + return debugDescription + } + + public var debugDescription: String{ + var str = "" + str += "Dim: \(dim) \n value:[ " + str += "\(metalTexture.description)" + str += " ]" + return str + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift index fcedbd36f7f50b348aab97de18c9fee414f182cf..a2f4104b9bf3d42b1c9aba06278b314409088d69 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpCreator.swift @@ -15,7 +15,7 @@ import Foundation fileprivate var singletons : [String : Any] = [:] -class OpCreator { +class OpCreator { static var shared : OpCreator

{ let key = String(describing: P.self) if let singleton = singletons[key] { @@ -27,7 +27,7 @@ class OpCreator { } } - func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable { + func creat(device: MTLDevice, opDesc: PMOpDesc, scope: Scope, initContext: InitContext) throws -> Runable & InferShaperable { guard let opCreator = opCreators[opDesc.type] else { throw PaddleMobileError.opError(message: "there is no " + opDesc.type + " yet") } @@ -69,6 +69,6 @@ class OpCreator { gConvAddAddPreluType : ConvAddAddPreluOp<P>.creat, gElementwiseAddPreluType : ElementwiseAddPreluOp<P>.creat, gFusionConvAddType : ConvAddOp<P>

.creat] - + private init(){} } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift index 01c22166642a1e16717f2cad3d434d2fb1ed0f76..37a847b7501ee93c2a9296957725a15594d8801f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/OpParam.swift @@ -22,199 +22,199 @@ import Foundation */ protocol OpParam { - associatedtype OutputType: Variant - var output: OutputType { get set } - func outputDesc() -> String - - //associatedtype ParamPrecisionType: PrecisionType - init(opDesc: PMOpDesc, inScope: Scope) throws - static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType - static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType - static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType - static func input(inputs: [String : [String]], from: Scope) throws -> VarType - static func output(outputs: [String : [String]], from: Scope) throws -> VarType - static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType - static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType - - static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType - - static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType - - static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType - - static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType - - static func getAttr(key: String, attrs: [String : Attr]) throws -> T - - static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType - + associatedtype OutputType: Variant + var output: OutputType { get set } + func outputDesc() -> String + + //associatedtype ParamPrecisionType: PrecisionProtocol + init(opDesc: PMOpDesc, inScope: Scope) throws + static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType + static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType + static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType + static func input(inputs: [String : [String]], from: Scope) throws -> VarType + static func output(outputs: [String : [String]], from: Scope) throws -> VarType + static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType + static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType + + static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType + + static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType + + static func outputOut(outputs: [String : [String]], from: Scope) throws 
-> VarType + + static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType + + static func getAttr(key: String, attrs: [String : Attr]) throws -> T + + static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType + } extension OpParam { - func outputDesc() -> String { - return output.debugDescription - } - - static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType { - guard let mapKeys = map[key], mapKeys.count > 0 else { - throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") + func outputDesc() -> String { + return output.debugDescription + } + + static func getFirstTensor(key: String, map: [String : [String]], from: Scope) throws -> VarType { + guard let mapKeys = map[key], mapKeys.count > 0 else { + throw PaddleMobileError.paramError(message: key + " not found in \(map) or maped values is empty") + } + guard let variant = from[mapKeys[0]] else { + throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") + } + + guard let v = variant as? VarType else { + throw PaddleMobileError.paramError(message: " type error") + + } + return v } - guard let variant = from[mapKeys[0]] else { - throw PaddleMobileError.paramError(message: mapKeys[0] + " not found in scope") + + static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from) + return tensorVariances + } catch let error { + throw error + } } - guard let v = variant as? VarType else { - throw PaddleMobileError.paramError(message: " type error") - + static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from) + return alphaTensor + } catch let error { + throw error + } + } + + + static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from) + return tensorImage + } catch let error { + throw error + } + } + + static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) + return tensorX + } catch let error { + throw error + } + } + + static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from) + return tensorBox + } catch let error { + throw error + } + } + + static func input(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) + return tensorInput + } catch let error { + throw error + } + } + + static func output(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) + return tensorOutput + } catch let error { + throw error + } + } + static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) + return tensorOutputY + } catch let error { + throw error + } + } + static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorY: VarType = try 
getFirstTensor(key: "Y", map: inputs, from: from) + return tensorY + } catch let error { + throw error + } + } + + static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType { + do { + let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) + return out + } catch let error { + throw error + } + } + static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) + return tensorFilter + } catch let error { + throw error + } + } + + static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from) + return tensorBias + } catch let error { + throw error + } + } + + static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) + return tensorMean + } catch let error { + throw error + } + } + + static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) + return tensorScale + } catch let error { + throw error + } + } + + static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType { + do { + let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) + return tensorVariance + } catch let error { + throw error + } + } + + static func getAttr(key: String, attrs: [String : Attr]) throws -> T{ + guard let attr = attrs[key] else { + throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) + } + + guard let tAttr = attr as? 
T else { + throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) + } + return tAttr } - return v - } - - static func outputVariances(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorVariances: VarType = try getFirstTensor(key: "Variances", map: outputs, from: from) - return tensorVariances - } catch let error { - throw error - } - } - - static func paramInputAlpha(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let alphaTensor: VarType = try getFirstTensor(key: "Alpha", map: inputs, from: from) - return alphaTensor - } catch let error { - throw error - } - } - - - static func inputImage(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorImage: VarType = try getFirstTensor(key: "Image", map: inputs, from: from) - return tensorImage - } catch let error { - throw error - } - } - - static func inputX(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorX: VarType = try getFirstTensor(key: "X", map: inputs, from: from) - return tensorX - } catch let error { - throw error - } - } - - static func outputBoxes(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorBox: VarType = try getFirstTensor(key: "Boxes", map: outputs, from: from) - return tensorBox - } catch let error { - throw error - } - } - - static func input(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorInput: VarType = try getFirstTensor(key: "Input", map: inputs, from: from) - return tensorInput - } catch let error { - throw error - } - } - - static func output(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorOutput: VarType = try getFirstTensor(key: "Output", map: outputs, from: from) - return tensorOutput - } catch let error { - throw error - } - } - static func outputY(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorOutputY: VarType = try getFirstTensor(key: "Y", map: outputs, from: from) - return tensorOutputY - } catch let error { - throw error - } - } - static func inputY(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorY: VarType = try getFirstTensor(key: "Y", map: inputs, from: from) - return tensorY - } catch let error { - throw error - } - } - - static func outputOut(outputs: [String : [String]], from: Scope) throws -> VarType { - do { - let out: VarType = try getFirstTensor(key: "Out", map: outputs, from: from) - return out - } catch let error { - throw error - } - } - static func inputFilter(paraInputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorFilter: VarType = try getFirstTensor(key: "Filter", map: paraInputs, from: from) - return tensorFilter - } catch let error { - throw error - } - } - - static func inputBiase(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorBias: VarType = try getFirstTensor(key: "Bias", map: inputs, from: from) - return tensorBias - } catch let error { - throw error - } - } - - static func inputMean(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorMean: VarType = try getFirstTensor(key: "Mean", map: inputs, from: from) - return tensorMean - } catch let error { - throw error - } - } - - static func inputScale(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorScale: VarType = try getFirstTensor(key: "Scale", map: inputs, from: from) - return tensorScale - } catch let error { - 
throw error - } - } - - static func inputVariance(inputs: [String : [String]], from: Scope) throws -> VarType { - do { - let tensorVariance: VarType = try getFirstTensor(key: "Variance", map: inputs, from: from) - return tensorVariance - } catch let error { - throw error - } - } - - static func getAttr(key: String, attrs: [String : Attr]) throws -> T{ - guard let attr = attrs[key] else { - throw PaddleMobileError.paramError(message: "attr \(key) can't found in: \(attrs)" ) - } - - guard let tAttr = attr as? T else { - throw PaddleMobileError.paramError(message: "key: \(key) attr: \(attr) type error" ) - } - return tAttr - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift index 532d1b661d4cb0e9823e09a9fc82d13af4f40f76..d6ba07add3f946e7ba5705d2ac06b80f3ae468ed 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift @@ -16,129 +16,135 @@ import Metal import Foundation protocol Fusion { - static func fusionNode() -> Node - static func change() -> [String : [(from: String, to: String)]] - static func fusionType() -> String - static func needCheck() -> [(Int, String)] + static func fusionNode() -> Node + static func change() -> [String : [(from: String, to: String)]] + static func fusionType() -> String + static func needCheck() -> [(Int, String)] } extension Fusion { - static func needCheck() -> [(Int, String)] { - return [] - } + static func needCheck() -> [(Int, String)] { + return [] + } } protocol Runable { - func run(device: MTLDevice, buffer: MTLCommandBuffer) throws - func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws - func delogOutput() - func inputVariant() -> [String : [MTLBuffer]] - func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws + func runImpl(device: MTLDevice,buffer: MTLCommandBuffer) throws + func delogOutput() + func inputVariant() -> [String : [MTLBuffer]] + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) } extension Runable where Self: OperatorProtocol{ - func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try runImpl(device: device, buffer: buffer) - } catch let error { - throw error + func run(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try runImpl(device: device, buffer: buffer) + } catch let error { + throw error + } + } + + func inputVariant() -> [String : [MTLBuffer]] { + // return [:] + fatalError(" op \(type) need implement inputVariant") } - } - - func inputVariant() -> [String : [MTLBuffer]] { -// return [:] - fatalError(" op \(type) need implement inputVariant") - } - - func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { - fatalError(" need implement ") - } - - func delogOutput() { - print(type + ": has no implementation" ) - } + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + fatalError(" need implement ") + } + + func delogOutput() { + + print(type + ": has no implementation" ) + } } public class InitContext { - /// metal 代码加载方式 - var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib - /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 - var metalLibPath: String? 
= nil - init() { - metalLoadMode = .LoadMetalInDefaultLib - metalLibPath = nil - } + + /// metal 代码加载方式 + var metalLoadMode: MetalLoadMode = .LoadMetalInDefaultLib + + /// 当 metalLoadMode 为 LoadMetalInCustomMetalLib 时, metal library 路径不能为空 + var metalLibPath: String? = nil + + /// 是否使用 MetalPerformanceShaders 进行运算 + var useMPS: Bool = false + + init() { + metalLoadMode = .LoadMetalInDefaultLib + metalLibPath = nil + } } protocol Creator where Self: OperatorProtocol{ - associatedtype OpType: OperatorProtocol & Runable & InferShaperable - static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType + associatedtype OpType: OperatorProtocol & Runable & InferShaperable + static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType } extension Creator where Self: OperatorProtocol { - static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType { - do { - return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext) - } catch let error { - throw error + static func creat(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> OpType { + do { + return try OpType.provide(device:device, opDesc: opDesc, inScope: inScope, initContext: initContext) + } catch let error { + throw error + } } - } } protocol InferShaperable { - func inferShape() + func inferShape() } protocol OperatorProtocol { - associatedtype ParamType - associatedtype KerType: Computable where Self.KerType.ParamType == ParamType - var type: String { get } - var scope: Scope { get } - var inputs: [String : [String]] { get } - var paraInputs: [String : [String]] { get set } - var outpus: [String : [String]] { get } - var attrs: [String : Attr] { get } - var para: ParamType { get } - var kernel: KerType { get } - init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws + associatedtype ParamType + associatedtype KerType: Computable where Self.KerType.ParamType == ParamType + var type: String { get } + var scope: Scope { get } + var inputs: [String : [String]] { get } + var paraInputs: [String : [String]] { get set } + var outpus: [String : [String]] { get } + var attrs: [String : Attr] { get } + var para: ParamType { get } + var kernel: KerType { get } + init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws } extension OperatorProtocol { - static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self { - do { - return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext) - } catch let error { - throw error + static func provide(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws -> Self { + do { + return try Self.init(device: device, opDesc: opDesc, inScope: inScope, initContext: initContext) + } catch let error { + throw error + } } - } } class Operator : OperatorProtocol where KernelType.ParamType == ParameterType { - required init(device: MTLDevice, opDesc: PMOpDesc, inScope: Scope, initContext: InitContext) throws { - type = opDesc.type - scope = inScope - inputs = opDesc.inputs - outpus = opDesc.outputs - attrs = opDesc.attrs - paraInputs = opDesc.paraInputs - do { - para = try ParamType.init(opDesc:opDesc, inScope: inScope) - } catch let error { - throw error + required init(device: MTLDevice, opDesc: PMOpDesc, 
inScope: Scope, initContext: InitContext) throws { + type = opDesc.type + scope = inScope + inputs = opDesc.inputs + outpus = opDesc.outputs + attrs = opDesc.attrs + paraInputs = opDesc.paraInputs + do { + para = try ParamType.init(opDesc:opDesc, inScope: inScope) + } catch let error { + throw error + } + kernel = KernelType.init(device: device, param: para, initContext: initContext) } - kernel = KernelType.init(device: device, param: para, initContext: initContext) - } - - typealias ParamType = ParameterType - typealias KerType = KernelType - let type: String - let inputs: [String : [String]] - var paraInputs: [String : [String]] - let outpus: [String : [String]] - let attrs: [String : Attr] - let para: ParamType - let scope: Scope - var kernel: KerType + + typealias ParamType = ParameterType + typealias KerType = KernelType + let type: String + let inputs: [String : [String]] + var paraInputs: [String : [String]] + let outpus: [String : [String]] + let attrs: [String : Attr] + let para: ParamType + let scope: Scope + var kernel: KerType } // op infos @@ -202,4 +208,4 @@ let opInfos = [gConvType : (inputs: ["Input"], outputs: ["Out gConvAddAddPreluType : (inputs: ["Input"], outputs: ["Out"]), gElementwiseAddPreluType : (inputs: ["X"], outputs: ["Out"]), gFusionConvAddType : (inputs: ["Input"], outputs: ["Out"]) - ] +] diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift index a877620416cb1b12be1ac1ef2a86f198fe75fc60..7e53ea8d1cdd211fa377acb0438b17697685a78b 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BatchNormOp.swift @@ -15,53 +15,53 @@ import Foundation import Metal -class BatchNormParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) - if input.transpose != [0, 2, 3, 1] { - fatalError("batch norm only accepts NHWC") - } - output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) - bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope) - mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope) - scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope) - variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope) - epsilon = try BatchNormParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) - } catch let error { - throw error +class BatchNormParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try BatchNormParam.inputX(inputs: opDesc.inputs, from: inScope) + if input.transpose != [0, 2, 3, 1] { + fatalError("batch norm only accepts NHWC") + } + output = try BatchNormParam.outputY(outputs: opDesc.outputs, from: inScope) + bias = try BatchNormParam.getFirstTensor(key: "Bias", map: opDesc.paraInputs, from: inScope) + mean = try BatchNormParam.getFirstTensor(key: "Mean", map: opDesc.paraInputs, from: inScope) + scale = try BatchNormParam.getFirstTensor(key: "Scale", map: opDesc.paraInputs, from: inScope) + variance = try BatchNormParam.getFirstTensor(key: "Variance", map: opDesc.paraInputs, from: inScope) + epsilon = try BatchNormParam.getAttr(key: 
"epsilon", attrs: opDesc.attrs) + momentum = try BatchNormParam.getAttr(key: "momentum", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let bias: Tensor

- let mean: Tensor<P> - let scale: Tensor<P> - let variance: Tensor<P> - let epsilon: Float - let momentum: Float + let input: Texture + var output: Texture + let bias: Tensor<P> + let mean: Tensor<P> + let scale: Tensor<P> + let variance: Tensor<P> + let epsilon: Float + let momentum: Float } -class BatchNormOp: Operator, BatchNormParam<P>>, Runable, Creator, InferShaperable{ - typealias OpType = BatchNormOp<P> - - func inferShape() { - para.output.dim = para.input.dim - } - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class BatchNormOp: Operator, BatchNormParam<P>>, Runable, Creator, InferShaperable{ + typealias OpType = BatchNormOp<P>

+ + func inferShape() { + para.output.dim = para.input.dim + } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift index a19dd1039073812b024a55c60bfad8c3c1387e71..51703c3e6fd07d6031bda497cf04e79dfc9aa1fe 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BilinearInterpOp.swift @@ -15,51 +15,51 @@ import Foundation import Metal -class BilinearInterpParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope) - out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs) - out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs) - } catch let error { - throw error +class BilinearInterpParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try BilinearInterpParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try BilinearInterpParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try BilinearInterpParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try BilinearInterpParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } + if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { + fatalError() + } } - if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { - fatalError() - } - } - let input: Texture - var output: Texture - let out_h: Int - let out_w: Int + let input: Texture + var output: Texture + let out_h: Int + let out_w: Int } -class BilinearInterpOp: Operator, BilinearInterpParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = BilinearInterpOp<P> - - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class BilinearInterpOp: Operator, BilinearInterpParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = BilinearInterpOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + // print(outputArray) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) -// print(outputArray) - print(outputArray.strideArray()) - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift index 4679885ab6e5c946d9b335f8b59f8537e37ea967..0ca6325922b4d1da2b3e1772503616482f6831d4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/BoxcoderOp.swift @@ -14,70 +14,70 @@ import Foundation -class BoxcoderParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope) - priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope) - targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope) - output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope) - codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs) - boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs) - } catch let error { - throw error +class BoxcoderParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope) + priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope) + targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope) + output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope) + codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs) + boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs) + } catch let error { + throw error + } + assert(priorBox.tensorDim.cout() == 2) + assert(priorBoxVar.tensorDim.cout() == 2) + assert(targetBox.tensorDim.cout() == 3) + assert(output.tensorDim.cout() == 3) + assert(priorBox.transpose == [0, 1, 2, 3]) + assert(priorBoxVar.transpose == [0, 1, 2, 3]) + assert(targetBox.transpose == [0, 1, 2, 3]) + assert(codeType == "decode_center_size") // encode_center_size is not implemented + assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1) } - assert(priorBox.tensorDim.cout() == 2) - assert(priorBoxVar.tensorDim.cout() == 2) - assert(targetBox.tensorDim.cout() == 3) - assert(output.tensorDim.cout() == 3) - assert(priorBox.transpose == [0, 1, 2, 3]) - assert(priorBoxVar.transpose == [0, 1, 
2, 3]) - assert(targetBox.transpose == [0, 1, 2, 3]) - assert(codeType == "decode_center_size") // encode_center_size is not implemented - assert((targetBox.tensorDim.cout() == 3) && (targetBox.tensorDim[0] == 1)) // N must be 1 (only handle batch size = 1) - } - let priorBox: Texture - let priorBoxVar: Texture - let targetBox: Texture - var output: Texture - let codeType: String - let boxNormalized: Bool + let priorBox: Texture + let priorBoxVar: Texture + let targetBox: Texture + var output: Texture + let codeType: String + let boxNormalized: Bool } -class BoxcoderOp: Operator, BoxcoderParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = BoxcoderOp<P> - - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class BoxcoderOp: Operator, BoxcoderParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = BoxcoderOp<P>

+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose) + let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose) + let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose) + let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(" prior box var ") + print(pbv.strideArray()) + print(" target box ") + print(tb.strideArray()) + print(" prior box ") + print(pb.strideArray()) + print(" output ") + print(out.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let pbv : [Float32] = device.texture2tensor(texture: para.priorBoxVar.metalTexture!, dim: para.priorBoxVar.tensorDim.dims, transpose: para.priorBoxVar.transpose) - let pb : [Float32] = device.texture2tensor(texture: para.priorBox.metalTexture!, dim: para.priorBox.tensorDim.dims, transpose: para.priorBox.transpose) - let tb : [Float32] = device.texture2tensor(texture: para.targetBox.metalTexture!, dim: para.targetBox.tensorDim.dims, transpose: para.targetBox.transpose) - let out : [Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(" prior box var ") - print(pbv.strideArray()) - print(" target box ") - print(tb.strideArray()) - print(" prior box ") - print(pb.strideArray()) - print(" output ") - print(out.strideArray()) - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift index c2c22d55af6fc33ca69cbc028f149d54285459e7..e526bf05db4e54cd48a3728b47d2ae31042e3499 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift @@ -14,63 +14,63 @@ import Foundation -class ConcatParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - guard let xlist = opDesc.inputs["X"] else { - fatalError() - } - for x in xlist { - guard let variant = inScope[x], let v = variant as? Texture else { - fatalError() +class ConcatParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + guard let xlist = opDesc.inputs["X"] else { + fatalError() + } + for x in xlist { + guard let variant = inScope[x], let v = variant as? 
Texture else { + fatalError() + } + if transpose.count == 0 { + transpose = v.transpose + } + if v.transpose != transpose { + fatalError() + } + + input.append(v) + } + axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) + output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error } - if transpose.count == 0 { - transpose = v.transpose - } - if v.transpose != transpose { - fatalError() - } - - input.append(v) - } - axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) - output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error } - } - var input: [Texture] = [] - var output: Texture - var transpose: [Int] = [] - let axis: Int + var input: [Texture] = [] + var output: Texture + var transpose: [Int] = [] + let axis: Int } -class ConcatOp: Operator, ConcatParam

>, Runable, Creator, InferShaperable{ - - typealias OpType = ConcatOp<P> - - func inferShape() { - // let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]} - // para.output.dim = Dim.init(inDim: dim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ConcatOp: Operator, ConcatParam<P>>, Runable, Creator, InferShaperable{ + + typealias OpType = ConcatOp<P>

+ + func inferShape() { + // let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]} + // para.output.dim = Dim.init(inDim: dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } - } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift index 552d72f436bf6de89f52bae186f72a0a778b1f4c..f22a0026d3a941942feffe5781958727f4f76601 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddAddPreluOp.swift @@ -15,95 +15,95 @@ import Foundation import Metal -class ConvAddAddPreluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) - alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error +class ConvAddAddPreluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let y: Tensor

- let filter: Tensor<P> - let mode: String - let alpha: Tensor<P> - var output: Texture - let stride: [Int32] - let paddings: [Int32] - let dilations: [Int32] - let groups: Int + + let input: Texture + let y: Tensor<P> + let filter: Tensor<P> + let mode: String + let alpha: Tensor<P> + var output: Texture + let stride: [Int32] + let paddings: [Int32] + let dilations: [Int32] + let groups: Int } -class ConvAddAddPreluOp: Operator, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{ - typealias OpType = ConvAddAddPreluOp<P>

- - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddAddPreluType - } - - static func needCheck() -> [(Int, String)] { - return [(2, "Y"), (2, "X")] - } - - - - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations +class ConvAddAddPreluOp: Operator, ConvAddAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{
+    typealias OpType = ConvAddAddPreluOp<P>
+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvAddAddPreluType + } + + static func needCheck() -> [(Int, String)] { + return [(2, "Y"), (2, "X")] + } + + + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - - filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - - groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) - variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) - bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) - - scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) - mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error +class ConvAddBatchNormReluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + + filter = try ConvAddBatchNormReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddBatchNormReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddBatchNormReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddBatchNormReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddBatchNormReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddBatchNormReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvAddBatchNormReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvAddBatchNormReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvAddBatchNormReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvAddBatchNormReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + + scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - - let variance: Tensor

-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let y: Tensor<P>
-  let filter: Tensor<P>
-  let epsilon: Float32
-  var newScale: MTLBuffer?
-  var newBiase: MTLBuffer?
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    
+    let variance: Tensor<P>
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let epsilon: Float32
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
-class ConvAddBatchNormReluOp<P: PrecisionProtocol>: Operator<ConvAddBatchNormReluKernel<P>, ConvAddBatchNormReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  
-  typealias OpType = ConvAddBatchNormReluOp<P>
- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations +class ConvAddBatchNormReluOp: Operator, ConvAddBatchNormReluParam

>, Runable, Creator, InferShaperable, Fusion{
+    
+    typealias OpType = ConvAddBatchNormReluOp<P>
+ + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0.. Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode } - } - - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gElementwiseAddType) - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvAddBatchNormReluType - } - - func delogOutput() { - print(" conv add batchnorm relu output ") - print(para.output.toTensor().strideArray()) - // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) - // para.filter.logDataPointer(header: "filter data pointer: ") - // print("filter: \(para.filter)") - // print("biase: \(para.y)") - // print("padding: \(para.paddings)") - // print("stride: \(para.stride)") + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } - // let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) - // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) - // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) + static func fusionType() -> String { + return gConvAddBatchNormReluType + } - // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) - } + func delogOutput() { + print(" conv add batchnorm relu output ") + print(para.output.toTensor().strideArray()) + // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) + // para.filter.logDataPointer(header: "filter data pointer: ") + // print("filter: \(para.filter)") + + // print("biase: \(para.y)") + // print("padding: \(para.paddings)") + // print("stride: \(para.stride)") + + // let _: P? = para.y.buffer?.logDesc(header: " biase: ", stridable: false) + // let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false) + // let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false) + + // let _: P? 
= para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift index 923c2c210ddba99dcebec77ae91299cd28ed638e..c24441f7f34956629774cab951819d89c09a1c41 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddOp.swift @@ -14,104 +14,104 @@ import Foundation -class ConvAddParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) - - y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error +class ConvAddParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvAddParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddParam.getAttr(key: "groups", attrs: opDesc.attrs) + + y = try ConvAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let y: Tensor

-  let filter: Tensor<P>
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
-class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddOp<P>
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-    --> Node.init(inType: gElementwiseAddType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddType
-  }
-  
-  func inferShape() {
+class ConvAddOp<P: PrecisionProtocol>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    typealias OpType = ConvAddOp<P>
+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) + return beginNode + } - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } - var outDim = [inDims[0]] - for i in 0.. String { + return gConvAddType } - outDim.append(filterDim[0]) - para.output.dim = Dim.init(inDim: outDim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + func inferShape() { + + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) - alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error +class ConvAddPreluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvAddPreluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs) + alpha = try ConvAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ConvAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + y = try ConvAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let y: Tensor

-  let filter: Tensor<P>
-  let mode: String
-  let alpha: Tensor<P>
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let y: Tensor<P>
+    let filter: Tensor<P>
+    let mode: String
+    let alpha: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
-class ConvAddPreluOp<P: PrecisionProtocol>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvAddPreluOp<P>
-  
-  static func fusionNode() -> Node {
-    let beginNode = Node.init(inType: gConvType)
-    _ = beginNode
-    --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType)
-    return beginNode
-  }
-  
-  static func change() -> [String : [(from: String, to: String)]] {
-    return [:]
-  }
-  
-  static func fusionType() -> String {
-    return gConvAddPreluType
-  }
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+class ConvAddPreluOp<P: PrecisionProtocol>: Operator<ConvAddPreluKernel<P>, ConvAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    typealias OpType = ConvAddPreluOp<P>
+ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gElementwiseAddType) --> Node.init(inType: gPreluType) + return beginNode + } - var outDim = [inDims[0]] - for i in 0.. [String : [(from: String, to: String)]] { + return [:] } - outDim.append(filterDim[0]) - para.output.dim = Dim.init(inDim: outDim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + static func fusionType() -> String { + return gConvAddPreluType + } + + func inferShape() { + let inDims = para.input.dim + let filterDim = para.filter.dim + let strides = para.stride + let paddings = para.paddings + let dilations = para.dilations + + var outDim = [inDims[0]] + for i in 0..: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) - epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) - - groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs) - variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) - bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) - scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) - mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) - } catch let error { - throw error +class ConvBNReluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvBNReluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvBNReluParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvBNReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + stride = try ConvBNReluParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvBNReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvBNReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) + epsilon = try ConvBNReluParam.getAttr(key: "epsilon", attrs: opDesc.attrs) + + groups = try ConvBNReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + variance = try ConvBNReluParam.inputVariance(inputs: opDesc.paraInputs, from: inScope) + bias = try ConvBNReluParam.inputBiase(inputs: opDesc.paraInputs, from: inScope) + scale = try ConvBNReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) + mean = try ConvBNReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) + } catch let error { + throw error + } } - } - - let input: Texture - let variance: Tensor

-  let bias: Tensor<P>
-  let mean: Tensor<P>
-  let scale: Tensor<P>
-  let filter: Tensor<P>
-  let epsilon: Float32
-  var newScale: MTLBuffer?
-  var newBiase: MTLBuffer?
-  
-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let variance: Tensor<P>
+    let bias: Tensor<P>
+    let mean: Tensor<P>
+    let scale: Tensor<P>
+    let filter: Tensor<P>
+    let epsilon: Float32
+    var newScale: MTLBuffer?
+    var newBiase: MTLBuffer?
+    
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
-class ConvBNReluOp<P: PrecisionProtocol>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvBNReluOp<P>
-  
-  func inputs() -> [Variant] {
-    return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter]
-  }
-  
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+class ConvBNReluOp<P: PrecisionProtocol>: Operator<ConvBNReluKernel<P>, ConvBNReluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+    typealias OpType = ConvBNReluOp<P>
+ + func inputs() -> [Variant] { + return [para.input, para.variance, para.bias, para.mean, para.scale, para.filter] + } - var outDim = [inDims[0]] - for i in 0.. Node { + let beginNode = Node.init(inType: gConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode } - } - - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gConvType) - _ = beginNode - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gConvBnReluType - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) - } - + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift index c66813b166fefd8fe5f139c94d73cf55ff83d682..93ddb53bc92cf96447aee39a6b9762b42ef79b26 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvOp.swift @@ -14,68 +14,68 @@ import Foundation -class ConvParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) - input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) - output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) - stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) - paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) - dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) - groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) - - } catch let error { - throw error +class ConvParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + filter = try ConvParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope) + input = try ConvParam.input(inputs: opDesc.inputs, from: inScope) + output = try ConvParam.output(outputs: opDesc.outputs, from: inScope) + stride = try ConvParam.getAttr(key: "strides", attrs: opDesc.attrs) + paddings = try ConvParam.getAttr(key: "paddings", attrs: opDesc.attrs) + dilations = try ConvParam.getAttr(key: "dilations", attrs: opDesc.attrs) + groups = try ConvParam.getAttr(key: "groups", attrs: opDesc.attrs) + + } catch let error { + throw error + } } - } - - let input: Texture - let filter: Tensor

-  var output: Texture
-  let stride: [Int32]
-  let paddings: [Int32]
-  let dilations: [Int32]
-  let groups: Int
+    
+    let input: Texture
+    let filter: Tensor<P>
+    var output: Texture
+    let stride: [Int32]
+    let paddings: [Int32]
+    let dilations: [Int32]
+    let groups: Int
 }
-class ConvOp<P: PrecisionProtocol>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
-  typealias OpType = ConvOp<P>
-  
-  func inferShape() {
-    let inDims = para.input.dim
-    let filterDim = para.filter.dim
-    let strides = para.stride
-    let paddings = para.paddings
-    let dilations = para.dilations
+class ConvOp<P: PrecisionProtocol>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, Creator, InferShaperable {
+    typealias OpType = ConvOp<P>
- var outDim = [inDims[0]] - for i in 0..: ConvParam

{ - //typealias ParamPrecisionType = P +class ConvTransposeParam: ConvParam

{ + //typealias ParamPrecisionType = P required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - try super.init(opDesc: opDesc, inScope: inScope) - } catch let error { - throw error + do { + try super.init(opDesc: opDesc, inScope: inScope) + } catch let error { + throw error + } } - } } -class ConvTransposeOp: Operator, ConvTransposeParam

>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = ConvTransposeOp<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ConvTransposeOp: Operator, ConvTransposeParam

>, Runable, Creator, InferShaperable{
+    
+    typealias OpType = ConvTransposeOp<P>
+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - - print(" \(type) output: ") - let padToFourDim = para.output.padToFourDim - if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } else if para.output.transpose == [0, 2, 3, 1] { - let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) - print(output.strideArray()) - } else { - print(" not implement") + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + + print(" \(type) output: ") + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) + print(output.strideArray()) + } else { + print(" not implement") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift index 96818a9fd8bd14a69b249200eb7c32c222096318..49e146b688e626e87d6641e6271e20b6109cc98e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/DepthwiseConvOp.swift @@ -14,42 +14,42 @@ import Foundation -class DepthConvOp: Operator, ConvParam

>, Runable, Creator, InferShaperable {
-  
-  typealias OpType = DepthConvOp<P>
- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations +class DepthConvOp: Operator, ConvParam

>, Runable, Creator, InferShaperable {
+    
+    typealias OpType = DepthConvOp<P>
- var outDim = [inDims[0]] - for i in 0..: Operator, ConvBNReluParam

>, Runable, Creator, InferShaperable, Fusion{
-  typealias OpType = ConvBNReluOp<P>
- - func inferShape() { - let inDims = para.input.dim - let filterDim = para.filter.dim - let strides = para.stride - let paddings = para.paddings - let dilations = para.dilations +class DwConvBNReluOp: Operator, ConvBNReluParam

>, Runable, Creator, InferShaperable, Fusion{
+    typealias OpType = ConvBNReluOp<P>
- var outDim = [inDims[0]] - for i in 0.. Node { + let beginNode = Node.init(inType: gDepthConvType) + _ = beginNode + --> Node.init(inType: gBatchNormType) + --> Node.init(inType: gReluType) + return beginNode + } + + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } + + static func fusionType() -> String { + return gDwConvBnReluType + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) } - } - - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gDepthConvType) - _ = beginNode - --> Node.init(inType: gBatchNormType) - --> Node.init(inType: gReluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gDwConvBnReluType - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift index 5fa69d4f44e48603dec9213be78d08b11b433edd..73a278e3c72460398c2aa17f39d67a7b7a37338d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddOp.swift @@ -15,81 +15,81 @@ import Foundation import Metal -class ElementwiseAddParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error +class ElementwiseAddParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + inputX = try ElementwiseAddParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor

= try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) + } + + // required init(device: MTLDevice, param: ElementwiseAddParam

) { + // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + // if computePrecision == .Float32 { + // super.init(device: device, inFunctionName: "elementwise_add") + // } else if computePrecision == .Float16 { + // super.init(device: device, inFunctionName: "elementwise_add_half") + // } else { + // fatalError() + // } + // } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } } - do { - inputY = try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch _ { - let tensorY: Tensor

= try ElementwiseAddParam.inputY(inputs: opDesc.paraInputs, from: inScope) - let device = inputX.metalTexture!.device - inputY = Texture.init(device: device, inDim: tensorY.dim) - let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) - inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) - } - -// required init(device: MTLDevice, param: ElementwiseAddParam

) { -// param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) -// if computePrecision == .Float32 { -// super.init(device: device, inFunctionName: "elementwise_add") -// } else if computePrecision == .Float16 { -// super.init(device: device, inFunctionName: "elementwise_add_half") -// } else { -// fatalError() -// } -// } - var offset = axis - if axis == -1 { - offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() - } - for i in 0..<(inputY.tensorDim.cout()) { - assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) - } - } - - var inputX: Texture - var inputY: Texture - var output: Texture - var axis: Int + var inputX: Texture + var inputY: Texture + var output: Texture + var axis: Int } -class ElementwiseAddOp: Operator, ElementwiseAddParam

>, Runable, Creator, InferShaperable{
-  typealias OpType = ElementwiseAddOp<P>
- - func inferShape() { -// para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ElementwiseAddOp: Operator, ElementwiseAddParam

>, Runable, Creator, InferShaperable{
+    typealias OpType = ElementwiseAddOp<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output) - let padToFourDim = para.output.padToFourDim - if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) - } else { - print(" not implement") + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift index 6a49d7bfa2fe4f060eedc84d47a8c1f8d64ee4d0..bcf0ba994ca48b6af18533f36718c1d458239e8e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ElementwiseAddPreluOp.swift @@ -15,102 +15,102 @@ import Foundation import Metal -class ElementwiseAddPreluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error +class ElementwiseAddPreluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } + do { + inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + } catch _ { + let tensorY: Tensor

= try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) + let device = inputX.metalTexture!.device + inputY = Texture.init(device: device, inDim: tensorY.dim) + let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) + inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) + } + + // required init(device: MTLDevice, param: ElementwiseAddParam

) { + // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + // if computePrecision == .Float32 { + // super.init(device: device, inFunctionName: "elementwise_add") + // } else if computePrecision == .Float16 { + // super.init(device: device, inFunctionName: "elementwise_add_half") + // } else { + // fatalError() + // } + // } + + var offset = axis + if axis == -1 { + offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + } + for i in 0..<(inputY.tensorDim.cout()) { + assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + } } - do { - inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - } catch _ { - let tensorY: Tensor

= try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope) - let device = inputX.metalTexture!.device - inputY = Texture.init(device: device, inDim: tensorY.dim) - let value: [P] = Array(UnsafeBufferPointer(start: tensorY.data.pointer, count: tensorY.dim.numel())) - inputY.metalTexture = device.tensor2texture(value: value, dim: tensorY.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) + + let mode: String + let alpha: Tensor

+ var inputX: Texture + var inputY: Texture + var output: Texture + var axis: Int +} + +class ElementwiseAddPreluOp: Operator, ElementwiseAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{ + static func fusionNode() -> Node { + let beginNode = Node.init(inType: gElementwiseAddType) + _ = beginNode + --> Node.init(inType: gPreluType) + return beginNode } - // required init(device: MTLDevice, param: ElementwiseAddParam

) { - // param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) - // if computePrecision == .Float32 { - // super.init(device: device, inFunctionName: "elementwise_add") - // } else if computePrecision == .Float16 { - // super.init(device: device, inFunctionName: "elementwise_add_half") - // } else { - // fatalError() - // } - // } + static func change() -> [String : [(from: String, to: String)]] { + return [:] + } - var offset = axis - if axis == -1 { - offset = inputX.tensorDim.cout() - inputY.tensorDim.cout() + static func fusionType() -> String { + return gElementwiseAddPreluType } - for i in 0..<(inputY.tensorDim.cout()) { - assert(inputX.tensorDim[offset + i] == inputY.tensorDim[i]) + + typealias OpType = ElementwiseAddPreluOp

+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - let mode: String - let alpha: Tensor

- var inputX: Texture - var inputY: Texture - var output: Texture - var axis: Int -} - -class ElementwiseAddPreluOp: Operator, ElementwiseAddPreluParam

>, Runable, Creator, InferShaperable, Fusion{ - static func fusionNode() -> Node { - let beginNode = Node.init(inType: gElementwiseAddType) - _ = beginNode - --> Node.init(inType: gPreluType) - return beginNode - } - - static func change() -> [String : [(from: String, to: String)]] { - return [:] - } - - static func fusionType() -> String { - return gElementwiseAddPreluType - } - - typealias OpType = ElementwiseAddPreluOp

- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } } - } - - - - func delogOutput() { - print(" \(type) output: ") - print(para.output) - let padToFourDim = para.output.padToFourDim - if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) - } else { - print(" not implement") + + + func delogOutput() { + print(" \(type) output: ") + print(para.output) + + let padToFourDim = para.output.padToFourDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + } else { + print(" not implement") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift index 46defcb58332a02cbc365a087708e792a66c6e5c..0d9510d2b0353890c517c6ece71b60635a10eaf0 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift @@ -16,55 +16,55 @@ import Foundation import MetalKit import CoreMedia -class FeedParam: OpParam{ - var output: Texture - var input: InputTexture { - return scope.input() as! InputTexture - } - let scope: Scope - - required init(opDesc: PMOpDesc, inScope: Scope) throws { - scope = inScope - do { - output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error +class FeedParam: OpParam{ + var output: Texture + var input: InputTexture { + return scope.input() as! InputTexture } - } - - //typealias ParamPrecisionType = P + let scope: Scope + + required init(opDesc: PMOpDesc, inScope: Scope) throws { + scope = inScope + do { + output = try FeedParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } + } + + //typealias ParamPrecisionType = P } -class FeedOp: Operator, FeedParam

>, Runable, Creator, InferShaperable {
-  typealias OpType = FeedOp<P>
- - func inferShape() { - // print("feed input: \(para.input.expectDim)") - print("feed output: \(para.output.dim)") - // para.output.dim = - // para.output.dim = para.input.expectDim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class FeedOp: Operator, FeedParam

>, Runable, Creator, InferShaperable {
+    typealias OpType = FeedOp<P>
+ + func inferShape() { + // print("feed input: \(para.input.expectDim)") + print("feed output: \(para.output.dim)") + // para.output.dim = + // para.output.dim = para.input.expectDim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + + // let resizeKernel = ResizeKernel

.init(device: device) + // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) + // do { + // try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) + // } catch let error { + // throw error + // } } - // let resizeKernel = ResizeKernel

.init(device: device) - // let resizeParam = ResizeParam.init(input: para.input.mtlTexture, output: para.output.metalTexture, expectDim: para.input.expectDim) - // do { - // try resizeKernel.compute(commandBuffer: buffer, param: resizeParam) - // } catch let error { - // throw error - // } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture) - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) - } + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture) + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift index a5d04a4b03a182a4e843a31628bd2892de597093..e9a5a395aed840d37e29ea1dec16b34085d80624 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FetchOp.swift @@ -15,44 +15,44 @@ import Foundation import Metal -class FetchParam: OpParam{ - var output: FetchHolder - let input: Texture - let scope: Scope - required init(opDesc: PMOpDesc, inScope: Scope) throws { - scope = inScope - do { - input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) - output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim) - scope.setOutput(output: output) - } catch let error { - throw error +class FetchParam: OpParam{ + var output: FetchHolder + let input: Texture + let scope: Scope + required init(opDesc: PMOpDesc, inScope: Scope) throws { + scope = inScope + do { + input = try FetchParam.inputX(inputs: opDesc.inputs, from: inScope) + output = FetchHolder.init(inPaddedCapacity: input.elementCount(), inDim: input.tensorDim) + scope.setOutput(output: output) + } catch let error { + throw error + } } - } - - //typealias ParamPrecisionType = P + + //typealias ParamPrecisionType = P } -class FetchOp: Operator< FetchKernel

, FetchParam<P>>, Runable, Creator, InferShaperable {
-  
-  typealias OpType = FetchOp<P>
- - func inferShape() { - print(para.input.dim) - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class FetchOp: Operator< FetchKernel

, FetchParam<P>>, Runable, Creator, InferShaperable {
+    
+    typealias OpType = FetchOp<P>
+ + func inferShape() { + print(para.input.dim) + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print("fetch output: ") + let resArr = self.para.output.result.floatArr(count: self.para.output.capacity) + print(resArr.strideArray()) } - } - - func delogOutput() { - print("fetch output: ") - let resArr = self.para.output.result.floatArr(count: self.para.output.capacity) - print(resArr.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift index 8500798adc75f9fac9e960857e9b0de319157c95..f5d100494835a6980a3604914010797b1d6c0198 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FlattenOp.swift @@ -14,46 +14,46 @@ import Foundation -class FlattenParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error +class FlattenParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try FlattenParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try FlattenParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try FlattenParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let axis: Int + let input: Texture + var output: Texture + let axis: Int } -class FlattenOp: Operator, FlattenParam

>, Runable, Creator, InferShaperable{
-  
-  typealias OpType = FlattenOp<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class FlattenOp: Operator, FlattenParam

>, Runable, Creator, InferShaperable{
+    
+    typealias OpType = FlattenOp<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift index a7aaa9eddc49361ef718e5f6e627face49bc43b0..43ce7927ebf90c5ccc2ae1acf7df8f3f6b681863 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Base/Kernel.swift @@ -19,125 +19,125 @@ public protocol TestParam { } public protocol Testable { - associatedtype TestParamType: TestParam - func test(commandBuffer: MTLCommandBuffer, param: TestParamType) - init(device: MTLDevice, testParam: TestParamType, initContext: InitContext) + associatedtype TestParamType: TestParam + func test(commandBuffer: MTLCommandBuffer, param: TestParamType) + init(device: MTLDevice, testParam: TestParamType, initContext: InitContext) } protocol Computable { - associatedtype ParamType: OpParam - func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws - init(device: MTLDevice, param: ParamType, initContext: InitContext) + associatedtype ParamType: OpParam + func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws + init(device: MTLDevice, param: ParamType, initContext: InitContext) } protocol KernelProtocol { - var pipline: MTLComputePipelineState { get set } - var functionName: String { get set } - + var pipline: MTLComputePipelineState { get set } + var functionName: String { get set } + } @objc open class Kernel: NSObject{ - let pipline: MTLComputePipelineState - let functionName: String - public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) { - pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) - functionName = inFunctionName - } + let pipline: MTLComputePipelineState + let functionName: String + public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = false, initContext: InitContext) { + pipline = device.pipeLine(funcName: inFunctionName, metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) + functionName = inFunctionName + } } @objc public class Shape: NSObject { - public let width: Int - public let height: Int - public let channel: Int - @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){ - width = inWidth - height = inHeight - channel = inChannel - } + public let width: Int + public let height: Int + public let channel: Int + @objc public init(inWidth: Int, inHeight: Int, inChannel: Int){ + width = inWidth + height = inHeight + channel = inChannel + } } open class BufferToTextureKernel: Kernel 
{ - public let outputTexture: MTLTexture - - public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.textureType = .type2D - textureDesc.width = outputDim.width - textureDesc.height = outputDim.height - textureDesc.depth = (outputDim.channel + 3) / 4 + public let outputTexture: MTLTexture - if GlobalConfig.shared.computePrecision == .Float16 { - textureDesc.pixelFormat = .rgba16Float - } else if GlobalConfig.shared.computePrecision == .Float32 { - textureDesc.pixelFormat = .rgba32Float - } else { - fatalError() + public init(device: MTLDevice, outputDim: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 + + if GlobalConfig.shared.computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if GlobalConfig.shared.computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() + } + + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " + let initContext = InitContext.init() + initContext.metalLibPath = metalLibPath + initContext.metalLoadMode = metalLoadMode + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext) + } } - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.storageMode = .shared - outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " - let initContext = InitContext.init() - initContext.metalLibPath = metalLibPath - initContext.metalLoadMode = metalLoadMode - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "buffer_to_texture_kernel", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", initContext: initContext) - } - } - - public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + public func compute(inputBuffer: MTLBuffer , commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setBuffer(inputBuffer, offset: 0, index: 0) + encoder.setTexture(outputTexture, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() } - encoder.setBuffer(inputBuffer, offset: 0, index: 0) - encoder.setTexture(outputTexture, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: outputTexture) - encoder.endEncoding() - } - } @objc open class CusomKernel: Kernel { - - public let outputTexture: MTLTexture - public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) 
{ - let textureDesc = MTLTextureDescriptor.init() - textureDesc.textureType = .type2D - textureDesc.width = outputDim.width - textureDesc.height = outputDim.height - textureDesc.depth = (outputDim.channel + 3) / 4 - if GlobalConfig.shared.computePrecision == .Float16 { - textureDesc.pixelFormat = .rgba16Float - } else if GlobalConfig.shared.computePrecision == .Float32 { - textureDesc.pixelFormat = .rgba32Float - } else { - fatalError() + public let outputTexture: MTLTexture + public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, metalLoadModel: MetalLoadMode, metalLibPath: String?) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 + + if GlobalConfig.shared.computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if GlobalConfig.shared.computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() + } + + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " + + let context = InitContext.init() + context.metalLoadMode = metalLoadModel + context.metalLibPath = metalLibPath + super.init(device: device, inFunctionName: inFunctionName, initContext: context) } - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.storageMode = .shared - outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " - - let context = InitContext.init() - context.metalLoadMode = metalLoadModel - context.metalLibPath = metalLibPath - super.init(device: device, inFunctionName: inFunctionName, initContext: context) - } - - public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(inputTexuture, index: 0) + encoder.setTexture(outputTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() } - encoder.setTexture(inputTexuture, index: 0) - encoder.setTexture(outputTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: outputTexture) - encoder.endEncoding() - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift index 9eeb2aff9cdb3d476be93b75b5e642430f531331..ee2b4ad7e52646d4b415bcd6123dd78cdebece26 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BatchNormKernel.swift @@ -14,40 +14,40 @@ import Foundation -class BatchNormKernel: Kernel, Computable { - required init(device: MTLDevice, param: BatchNormParam
<P>
, initContext: InitContext) { - let count = param.variance.dim.numel() - let varianceP = param.variance.data.pointer - let meanP = param.mean.data.pointer - let scaleP = param.scale.data.pointer - let biasP = param.bias.data.pointer - for i in 0..: Kernel, Computable { + required init(device: MTLDevice, param: BatchNormParam
<P>
, initContext: InitContext) { + let count = param.variance.dim.numel() + let varianceP = param.variance.data.pointer + let meanP = param.mean.data.pointer + let scaleP = param.scale.data.pointer + let biasP = param.bias.data.pointer + for i in 0..) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.scale.buffer, offset: 0, index: 0) + encoder.setBuffer(param.bias.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(param.scale.buffer, offset: 0, index: 0) - encoder.setBuffer(param.bias.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift index 0db2e98651df8a7d778b7b9754ba1d059a54f365..8a217eacb11cdc3732540d576f62b91835221450 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BilinearInterpKernel.swift @@ -15,41 +15,41 @@ import Foundation struct BilinearInterpMetalParam { - var ratio_h: Float32 - var ratio_w: Float32 + var ratio_h: Float32 + var ratio_w: Float32 } -class BilinearInterpKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") +class BilinearInterpKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: BilinearInterpParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + var ratio_h: Float32 = 0 + var ratio_w: Float32 = 0 + if param.output.tensorDim.dims[2] > 1 { + ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) + } + if param.output.tensorDim.dims[3] > 1 { + ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) + } + var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - var ratio_h: Float32 = 0 - var ratio_w: Float32 = 0 - if param.output.tensorDim.dims[2] > 1 { - ratio_h = Float32(param.input.tensorDim.dims[2]-1) / Float32(param.output.tensorDim.dims[2]-1) + required init(device: MTLDevice, param: BilinearInterpParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext) + } else { + fatalError() + } } - if param.output.tensorDim.dims[3] > 1 { - ratio_w = Float32(param.input.tensorDim.dims[3]-1) / Float32(param.output.tensorDim.dims[3]-1) - } - var p = BilinearInterpMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) - encoder.setBytes(&p, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: BilinearInterpParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "bilinear_interp_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "bilinear_interp_half", initContext: initContext) - } else { - fatalError() - } - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift index 6e528a59650f017da0e50dff56f748e0255d6eee..94ece69eebdcab2faca3d287a770a16439843d4f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/BoxcoderKernel.swift @@ -17,30 +17,30 @@ import Foundation struct BoxcoderMetalParam { } -class BoxcoderKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") +class BoxcoderKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.priorBox.metalTexture, index: 0) + encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) + encoder.setTexture(param.targetBox.metalTexture, index: 2) + encoder.setTexture(param.output.metalTexture, index: 3) + var bmp = BoxcoderMetalParam.init() + encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.priorBox.metalTexture, index: 0) - encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) - encoder.setTexture(param.targetBox.metalTexture, index: 2) - encoder.setTexture(param.output.metalTexture, index: 3) - var bmp = BoxcoderMetalParam.init() - encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: BoxcoderParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext) - } else { - fatalError() + + required init(device: MTLDevice, param: BoxcoderParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 3, 1, 2], computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "boxcoder_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "boxcoder_half", initContext: initContext) + } else { + fatalError() + } } - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift index edb028968838f3ec8c11b45b649463da9f1d9ea1..803b2f593ccd3c8afffe4d6219be5314f950f254 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConcatKernel.swift @@ -16,133 +16,133 @@ import Foundation import Metal struct ConcatTestParam: TestParam { - var input: [MTLTexture] - var output: MTLTexture - var dims: [[Int]] - var axis: Int - var odim: [Int] + var input: [MTLTexture] + var output: MTLTexture + var dims: [[Int]] + var axis: Int + var odim: [Int] } struct ConcatMetalParam { - var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) - var axis: Int32 = 0 - var offset: Int32 = 0 - var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) - var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) + var odim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32, Int32, Int32) = (0, 0, 0, 0, 0, 0) } -class ConcatKernel: Kernel, Computable{ - var v = "normal" - var pm = ConcatMetalParam.init() - func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam
<P>
) throws { - - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - let num = param.input.count - for i in 0...size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: ConcatParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision) - let orank = param.output.tensorDim.cout() - let num = param.input.count - assert(num <= 6) - var axis = 4 - param.output.tensorDim.cout() + param.axis - for i in 0..<4 { - if param.transpose[i] == axis { - axis = i - break - } - } - pm.axis = Int32(axis) - pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3])) - pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3])) - var vdim: [Int] = [0, 0, 0, 0, 0, 0] - for i in 0..: Kernel, Computable{ + var v = "normal" + var pm = ConcatMetalParam.init() + func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam
<P>
) throws { + + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - } - } else if orank == 3 { - if axis == 2 { - v = "y" - } else if axis == 3 { - v = "x" - } else if axis == 1 { - var vz = true + let num = param.input.count for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ConcatParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: GlobalConfig.shared.computePrecision) + let orank = param.output.tensorDim.cout() + let num = param.input.count + assert(num <= 6) + var axis = 4 - param.output.tensorDim.cout() + param.axis + for i in 0..<4 { + if param.transpose[i] == axis { + axis = i + break + } + } + pm.axis = Int32(axis) + pm.odim = (Int32(param.output.dim[0]), Int32(param.output.dim[1]), Int32(param.output.dim[2]), Int32(param.output.dim[3])) + pm.trans = (Int32(param.output.transpose[0]), Int32(param.output.transpose[1]), Int32(param.output.transpose[2]), Int32(param.output.transpose[3])) + var vdim: [Int] = [0, 0, 0, 0, 0, 0] for i in 0..: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddAddPreluParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) - } +class ConvAddAddPreluKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddAddPreluParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, 
inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + fatalError() } - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) - } else { - super.init(device: 
device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) - } + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else { - fatalError() + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam } - let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - - let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - - // print(" function: \(functionName)") - // 
print("offset x: \(offsetX)") - // print("offset y: \(offsetY)") - - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - // print("metal param: ") - // print(inMetalParam) - - metalParam = inMetalParam - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 6274e3df8f6e588ecee75ac243c3abe1b5f45828..ca6cae65c8fcac12ed29cd46775edeb2f43a3f8c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -16,165 +16,165 @@ import Foundation import Metal struct ConvAddBatchNormReluTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - var metalParam: MetalConvParam - let filterBuffer: MTLBuffer - let biaseBuffer: MTLBuffer - let newScaleBuffer: MTLBuffer - let newBiaseBuffer: MTLBuffer - let filterSize: (width: Int, height: Int, channel: Int) - init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { - inputTexture = inInputTexture - outputTexture = inOutputTexture - metalParam = inMetalParam - filterBuffer = inFilterBuffer - biaseBuffer = inBiaseBuffer - newScaleBuffer = inNewScaleBuffer - newBiaseBuffer = inNewBiaseBuffer - filterSize = inFilterSize - } + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } } -class ConvAddBatchNormReluKernel: Kernel, Computable, Testable { - required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) { - if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { - super.init(device: 
device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) - } else if testParam.filterSize.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) +class ConvAddBatchNormReluKernel: Kernel, Computable, Testable { + required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam, initContext: InitContext) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) + } } - } - - var metalParam: MetalConvParam! - - required init(device: MTLDevice, param: ConvAddBatchNormReluParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.variance.initBuffer(device: device, precision: .Float32) - param.mean.initBuffer(device: device, precision: .Float32) - param.scale.initBuffer(device: device, precision: .Float32) - param.bias.initBuffer(device: device, precision: .Float32) - if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else { - fatalError() + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvAddBatchNormReluParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + print("offset x: \(offsetX)") + print("offset y: \(offsetY)") + + let offsetZ = 0.0 + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + + // var newScaleFP16: UnsafeMutableRawPointer + // + // float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout
<P>
.size) + + + // let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if GlobalConfig.shared.computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if GlobalConfig.shared.computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() } - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - - print("offset x: \(offsetX)") - print("offset y: \(offsetY)") - - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - - var invs: [P] = [] - let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0...stride { - let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) - invs.append(P(inv)) + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.newScale!, offset: 0, index: 3) + encoder.setBuffer(param.newBiase!, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) - let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) - - let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) - let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) - let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { - newScale[i] = invs[i] * scaleContents[i] - newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() } - -// var newScaleFP16: UnsafeMutableRawPointer -// -// float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout
<P>
.size) - - -// let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) - - var newBiaseBuffer: MTLBuffer - var newScaleBuffer: MTLBuffer - - if GlobalConfig.shared.computePrecision == .Float32 { - newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! - newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! - } else if GlobalConfig.shared.computePrecision == .Float16 { - - newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - - float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) - - float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) - } else { - fatalError(" unsupport ") - } - - param.newBiase = newBiaseBuffer - param.newScale = newScaleBuffer - - newScale.deinitialize(count: param.scale.buffer.length) - newScale.deallocate() - - newBiase.deinitialize(count: param.bias.buffer.length) - newBiase.deallocate() - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.newScale!, offset: 0, index: 3) - encoder.setBuffer(param.newBiase!, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - - encoder.setTexture(param.inputTexture, index: 0) - encoder.setTexture(param.outputTexture, index: 1) - var inMetalParam = param.metalParam - encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) - encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) - encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) - encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift index 0ba448161f4596b45797aec7ef186949de277c26..70c3379e8a97edd36e47d35a708a0ea2fa4b2d9e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift @@ -13,76 +13,218 @@ limitations under the License. */ import Foundation +import MetalPerformanceShaders -class ConvAddKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext) - } else if param.filter.width == 1 && param.filter.height == 5 { - super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext) - } else if param.filter.width == 5 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext) - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext) - } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext) - } else if param.filter.width == 1 && param.filter.height == 5 { - super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext) - } else if param.filter.width == 5 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext) - } else { - fatalError(" unsupport yet ") - } - } else { - fatalError() - } - +@available(iOS 10.0, *) +var convDic: [String : MPSCNNConvolution] = [:] +@available(iOS 10.0, *) +var imageDic: [String : MPSImage] = [:] + +/// 获取唯一字符串 +/// +/// - Returns: 唯一字符串 +func getUniqueKey() -> String { + return UUID.init().uuidString +} +@available(iOS 11.0, *) +class ConvDataSource: NSObject, MPSCNNConvolutionDataSource { - let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) + var _descriptor: MPSCNNConvolutionDescriptor + var _weightsTensor: Tensor
<P>
+ var _biasTensor: Tensor
<P>
+ var _biasTerms: UnsafeMutablePointer? - let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + func load() -> Bool { + switch P.precisionType { + case .Float32: + _biasTerms = _biasTensor.data.pointer as? UnsafeMutablePointer + case .Float16: + _biasTerms = UnsafeMutablePointer.allocate(capacity: _biasTensor.data.count) + if let float16Point = _biasTensor.data.pointer as? UnsafeMutablePointer { + float16to32(input: float16Point, output: _biasTerms!, count: _biasTensor.data.count) + } + } + return true + } + + func purge() { + switch P.precisionType { + case .Float32: + return + case .Float16: + _biasTerms?.deinitialize(count: _biasTensor.data.count) + _biasTerms?.deallocate() + } + } + + func label() -> String? { + return "conv_add_label" + } + + func copy(with zone: NSZone? = nil) -> Any { + return self + } + + init(inDesc: MPSCNNConvolutionDescriptor, + inWeights: Tensor
<P>
, + inBiasTerms: Tensor
<P>
) { + _descriptor = inDesc + _weightsTensor = inWeights + _biasTensor = inBiasTerms + super.init() + } + + func descriptor() -> MPSCNNConvolutionDescriptor { + return _descriptor + } + + func dataType() -> MPSDataType { + switch P.precisionType { + case .Float32: + return .float32 + case .Float16: + return .float16 + } + } + + func weights() -> UnsafeMutableRawPointer { + return UnsafeMutableRawPointer.init(_weightsTensor.data.pointer) + } + + func biasTerms() -> UnsafeMutablePointer? { + return _biasTerms + } + +} + + +class ConvAddKernel: Kernel, Computable { + var metalParam: MetalConvParam! -// print(" function: \(functionName)") -// print("offset x: \(offsetX)") -// print("offset y: \(offsetY)") + let identifyingKey: String = getUniqueKey() - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) -// print("metal param: ") -// print(inMetalParam) + required init(device: MTLDevice, param: ConvAddParam
<P>
, initContext: InitContext) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + + let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1)/2 - Int(param.paddings[1]) + let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1)/2 - Int(param.paddings[0]) + + let key = identifyingKey + + if initContext.useMPS { // 使用 apple 的 MetalPerformanceShaders + if #available(iOS 11.0, *) { + var desc: MPSCNNConvolutionDescriptor? + // 如果不是 depth wise, 并且输入输出 tensor channel 都大于 4 + if !(param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]) && param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { + desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], + kernelHeight: param.filter.tensorDim[2], + inputFeatureChannels: param.input.tensorDim[1], + outputFeatureChannels: param.output.tensorDim[1], + neuronFilter: nil) + desc?.strideInPixelsX = Int(param.stride[0]) + desc?.strideInPixelsY = Int(param.stride[1]) + } else if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { + desc = MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], + kernelHeight: param.filter.tensorDim[2], + inputFeatureChannels: param.input.tensorDim[1], + outputFeatureChannels: param.output.tensorDim[1], + neuronFilter: nil) + + } + + desc?.strideInPixelsX = Int(param.stride[0]) + desc?.strideInPixelsY = Int(param.stride[1]) + if let inDesc = desc { + let _ = param.filter.convert(converter: MPSPointerConverter
<P>
.init()) + let dataSource = ConvDataSource.init(inDesc: inDesc, inWeights: param.filter, inBiasTerms: param.y) + let conv = MPSCNNConvolution.init(device: device, weights: dataSource) + conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0) + conv.edgeMode = .zero + convDic[key] = conv + imageDic[identifyingKey + "_input"] = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1]) + imageDic[identifyingKey + "_output"] = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1]) + super.init(device: device, inFunctionName: "place_holder", initContext: initContext) + return + } + } + } + + let padWhenOneC = !(param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1]) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, padWhenOneC: padWhenOneC) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1_half", initContext: initContext) + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3_half", initContext: initContext) + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1_half", initContext: initContext) + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5_half", initContext: initContext) + } else { + fatalError(" unsupport yet ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1", initContext: initContext) + } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3", initContext: initContext) + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1", initContext: initContext) + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_3x3", initContext: initContext) + } else { + fatalError(" unsupport yet ") + } + } else { + fatalError() + } + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam + } - metalParam = inMetalParam - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam
<P>
) throws { + if #available(iOS 10.0, *) { + if let conv = convDic[identifyingKey], let inputImage = imageDic[identifyingKey + "_input"], let outputImage = imageDic[identifyingKey + "_output"] { + conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + return; + } + } + + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + deinit { + if #available(iOS 10.0, *) { + convDic.removeValue(forKey: identifyingKey) + imageDic.removeValue(forKey: identifyingKey + "_input") + imageDic.removeValue(forKey: identifyingKey + "_output") + } + } } + diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift index 1d66696050f6db9c9c0ab041f9ad4b7ed2369648..3535ae1868ecde2ac2794b542dbefdd72b198e2c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift @@ -14,137 +14,137 @@ import Foundation -class ConvAddPreluKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvAddPreluParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) - } +class ConvAddPreluKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvAddPreluParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.y.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_half", initContext: initContext) + } + + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) + } else { + super.init(device: device, 
inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.channel == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 3 && param.filter.height == 3 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) + } + + } else if param.filter.width == 1 && param.filter.height == 5 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) + } + } else if param.filter.width == 5 && param.filter.height == 1 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) + } + } else { + fatalError(" unsupport yet ") + } } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_half", initContext: initContext) + fatalError() } - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_half", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_half", initContext: initContext) - } else { - super.init(device: 
device, inFunctionName: "conv_add_1x5_prelu_other_half", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.channel == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 3 && param.filter.height == 3 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_3x3_prelu_other_float", initContext: initContext) - } + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - } else if param.filter.width == 1 && param.filter.height == 5 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_5x1_prelu_other_float", initContext: initContext) - } - } else if param.filter.width == 5 && param.filter.height == 1 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_add_1x5_prelu_other_float", initContext: initContext) - } - } else { - fatalError(" unsupport yet ") - } - } else { - fatalError() + let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) + + // print(" function: \(functionName)") + // print("offset x: \(offsetX)") + // print("offset y: \(offsetY)") + + let offsetZ = 0.0 + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + // print("metal param: ") + // print(inMetalParam) + + metalParam = inMetalParam } - let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - - let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - - // print(" function: \(functionName)") - // 
print("offset x: \(offsetX)") - // print("offset y: \(offsetY)") - - let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - // print("metal param: ") - // print(inMetalParam) - - metalParam = inMetalParam - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift index 81c53a57a81155a9d4e804472764e3e0dab28fa6..9fa519a7eb80928919048e4d67c09456a87ab86f 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift @@ -16,165 +16,165 @@ import Foundation import MetalPerformanceShaders struct ConvBNReluTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - var metalParam: MetalConvParam - let filterBuffer: MTLBuffer - let biaseBuffer: MTLBuffer - let newScaleBuffer: MTLBuffer - let newBiaseBuffer: MTLBuffer - let filterSize: (width: Int, height: Int, channel: Int) - init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { - - inputTexture = inInputTexture - outputTexture = inOutputTexture - metalParam = inMetalParam - filterBuffer = inFilterBuffer - biaseBuffer = inBiaseBuffer - newScaleBuffer = inNewScaleBuffer - newBiaseBuffer = inNewBiaseBuffer - filterSize = inFilterSize - } + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } } -class ConvBNReluKernel: Kernel, Computable, Testable { - required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) { - if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { - super.init(device: device, inFunctionName: 
"conv_batch_norm_relu_1x1", initContext: initContext) - } else if testParam.filterSize.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) +class ConvBNReluKernel: Kernel, Computable, Testable { + required init(device: MTLDevice, testParam: ConvBNReluTestParam, initContext: InitContext) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) + } } - } - - var metalParam: MetalConvParam! - - required init(device: MTLDevice, param: ConvBNReluParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.variance.initBuffer(device: device, precision: .Float32) - param.mean.initBuffer(device: device, precision: .Float32) - param.scale.initBuffer(device: device, precision: .Float32) - param.bias.initBuffer(device: device, precision: .Float32) - - if GlobalConfig.shared.computePrecision == .Float32 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext) - } else { - fatalError(" unsupport ") - } - } else { - fatalError() + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvBNReluParam
<P>
, initContext: InitContext) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half", initContext: initContext) + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + // print(" param filter width: \(param.filter.width)") + // print(" param filter height: \(param.filter.height)") + // + // print(" param paddings: \(param.paddings)") + // + // print("ConvBNReluKernel offset x: \(offsetX)") + // print("ConvBNReluKernel offset y: \(offsetY)") + + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if GlobalConfig.shared.computePrecision == .Float32 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if GlobalConfig.shared.computePrecision == .Float16 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() } - - - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - -// print(" param filter width: \(param.filter.width)") -// print(" param filter height: \(param.filter.height)") -// -// print(" param paddings: \(param.paddings)") -// -// print("ConvBNReluKernel offset x: \(offsetX)") -// print("ConvBNReluKernel offset y: \(offsetY)") - - let offsetZ = 0.0 - - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - - var invs: [P] = [] - let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0...stride { - let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) - invs.append(P(inv)) + func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.newScale!, offset: 0, index: 2) + encoder.setBuffer(param.newBiase!, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - let newScale: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.scale.buffer.length) - let newBiase: UnsafeMutablePointer
<P>
= UnsafeMutablePointer
<P>
.allocate(capacity: param.bias.buffer.length) - - let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) - let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) - let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { - newScale[i] = invs[i] * scaleContents[i] - newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() } - - var newBiaseBuffer: MTLBuffer - var newScaleBuffer: MTLBuffer - - if GlobalConfig.shared.computePrecision == .Float32 { - newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! - newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! - } else if GlobalConfig.shared.computePrecision == .Float16 { - - newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! - - float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout
<P>
.size) - - float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout
<P>
.size) - } else { - fatalError(" unsupport ") - } - - param.newBiase = newBiaseBuffer - param.newScale = newScaleBuffer - - newScale.deinitialize(count: param.scale.buffer.length) - newScale.deallocate() - - newBiase.deinitialize(count: param.bias.buffer.length) - newBiase.deallocate() - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.newScale!, offset: 0, index: 2) - encoder.setBuffer(param.newBiase!, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - - encoder.setTexture(param.inputTexture, index: 0) - encoder.setTexture(param.outputTexture, index: 1) - var inMetalParam = param.metalParam - encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) - encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 2) - encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 3) - encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift index 7571bc155b6a38afdcec7f646be6927f45f4b13a..fff6dd4726b16fd258df8ba1469d4889900a9970 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift @@ -15,48 +15,46 @@ import Foundation public struct MetalConvParam { - let offsetX: Int16 - let offsetY: Int16 - let offsetZ: Int16 - let strideX: UInt16 - let strideY: UInt16 - let dilationX: UInt16 - let dilationY: UInt16 + let offsetX: Int16 + let offsetY: Int16 + let offsetZ: Int16 + let strideX: UInt16 + let strideY: UInt16 + let dilationX: UInt16 + let dilationY: UInt16 } -class ConvKernel: Kernel, Computable { - var metalParam: MetalConvParam! - required init(device: MTLDevice, param: ConvParam
<P>
, initContext: InitContext) { - param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext) - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext) - } else if param.filter.width == 3 && param.filter.height == 3 { - super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext) - } else { - fatalError(" unsupport ") +class ConvKernel: Kernel, Computable { + var metalParam: MetalConvParam! + required init(device: MTLDevice, param: ConvParam
<P>
, initContext: InitContext) { + param.filter.initBuffer(device: device, precision: Precision.Float32) + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_1x1", initContext: initContext) + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_3x3", initContext: initContext) + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_3x3", initContext: initContext) + } else { + fatalError(" unsupport ") + } + + let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) + let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) + let offsetZ = 0.0 + + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } - - let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) - let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ConvParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift index c8b1361649f40237e1527744ea9ba2ad8b1648c1..33ee422c3add3ca00f9d88736427873d41b6d702 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvTransposeKernel.swift @@ -15,69 +15,69 @@ import Foundation struct MetalConvTransposeParam { - let kernelW: UInt16; - let kernelH: UInt16; - - let strideX: UInt16; - let strideY: UInt16; - - let paddingX: UInt16; - let paddingY: UInt16; - - let dilationX: UInt16; - let dilationY: UInt16; -} - -class ConvTransposeKernel: Kernel, Computable{ - var metalParam: MetalConvTransposeParam! - required init(device: MTLDevice, param: ConvTransposeParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true) - if GlobalConfig.shared.computePrecision == .Float32 { - if param.stride == [2, 2] && param.stride == [2, 2] { - super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext) - } else { - fatalError(" -- conv transpose unsupported yet -- ") - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.stride == [2, 2] && param.stride == [2, 2] { - super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext) - } else { - fatalError(" -- conv transpose unsupported yet -- ") - } - } else { - fatalError() - } + let kernelW: UInt16; + let kernelH: UInt16; -// let filter: [Float32] = param.filter.buffer.array() -// print(" conv transpose filter") -// print(filter) - let kernelWidth = UInt16(param.filter.width) - let kernelHeight = UInt16(param.filter.height) + let strideX: UInt16; + let strideY: UInt16; - let strideX = UInt16(param.stride[0]) - let strideY = UInt16(param.stride[1]) - let paddingX = UInt16(param.paddings[0]) - let paddingY = UInt16(param.paddings[1]) - let dilationX = UInt16(param.dilations[0]) - let dilationY = UInt16(param.dilations[1]) + let paddingX: UInt16; + let paddingY: UInt16; - metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) + let dilationX: UInt16; + let dilationY: UInt16; +} - } - - func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") +class ConvTransposeKernel: Kernel, Computable{ + var metalParam: MetalConvTransposeParam! + required init(device: MTLDevice, param: ConvTransposeParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + param.filter.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision, convertToNHWC: false, withTranspose: true) + if GlobalConfig.shared.computePrecision == .Float32 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2", initContext: initContext) + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.stride == [2, 2] && param.stride == [2, 2] { + super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half", initContext: initContext) + } else { + fatalError(" -- conv transpose unsupported yet -- ") + } + } else { + fatalError() + } + + // let filter: [Float32] = param.filter.buffer.array() + // print(" conv transpose filter") + // print(filter) + let kernelWidth = UInt16(param.filter.width) + let kernelHeight = UInt16(param.filter.height) + + let strideX = UInt16(param.stride[0]) + let strideY = UInt16(param.stride[1]) + let paddingX = UInt16(param.paddings[0]) + let paddingY = UInt16(param.paddings[1]) + let dilationX = UInt16(param.dilations[0]) + let dilationY = UInt16(param.dilations[1]) + + metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) + } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift index 21108de10e6de4848649a0cbd237ff36243e7be9..8d4bed8674db48f3b73d86b25a6ea787b216c02a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift @@ -15,59 +15,59 @@ import Foundation struct ElementwiseAddMetalParam { - var fast: Int32 = 0 - var axis: Int32 = 0 - var ylen: Int32 = 0 - var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) - var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) - var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) - var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var fast: Int32 = 0 + var axis: Int32 = 0 + var ylen: Int32 = 0 + var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var xtrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var ytrans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) } -class ElementwiseAddKernel: Kernel, Computable { - var metalParam: ElementwiseAddMetalParam - required init(device: MTLDevice, param: ElementwiseAddParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) - - metalParam = ElementwiseAddMetalParam.init() - - let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } - let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } - let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } - let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - - metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) - metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) - metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) - metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) - if param.axis == -1 { - metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) - } else { - metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) +class ElementwiseAddKernel: Kernel, Computable { + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext) + } else { + fatalError() + } } - metalParam.ylen = Int32(param.inputY.tensorDim.cout()) - if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - // print("===> elementwise_add fast!!!") - metalParam.fast = 1 - } - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "elementwise_add_half", initContext: initContext) - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.inputX.metalTexture, index: 0) - encoder.setTexture(param.inputY.metalTexture, index: 1) - encoder.setTexture(param.output.metalTexture, index: 2) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift index a423a119f375641eeadd585360d62787d55a82d4..34c45cb06171ee37b292597240ca748e57d78137 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddPreluKernel.swift @@ -15,65 +15,65 @@ import Foundation -class ElementwiseAddPreluKernel: Kernel, Computable { - var metalParam: ElementwiseAddMetalParam - required init(device: MTLDevice, param: ElementwiseAddPreluParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - - metalParam = ElementwiseAddMetalParam.init() - - let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } - let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } - let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } - let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - - metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) - metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) - metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) - metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) - if param.axis == -1 { - metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) - } else { - metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) - } - metalParam.ylen = Int32(param.inputY.tensorDim.cout()) - if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - // print("===> elementwise_add fast!!!") - metalParam.fast = 1 +class ElementwiseAddPreluKernel: Kernel, Computable { + var metalParam: ElementwiseAddMetalParam + required init(device: MTLDevice, param: ElementwiseAddPreluParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: GlobalConfig.shared.computePrecision) + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + + metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if param.axis == -1 { + metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) + } + metalParam.ylen = Int32(param.inputY.tensorDim.cout()) + if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext) + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) + } + } else { + fatalError() + } } - if GlobalConfig.shared.computePrecision == .Float32 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "elementwise_add_channel_float", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "elementwise_add_element_float", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "elementwise_add_prelu_float", initContext: initContext) - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "elementwise_add_channel_half", initContext: initContext) - } - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.inputX.metalTexture, index: 0) + encoder.setTexture(param.inputY.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.inputX.metalTexture, index: 0) - encoder.setTexture(param.inputY.metalTexture, index: 1) - encoder.setTexture(param.output.metalTexture, index: 2) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift index 7d6e68e699b6a7556915f9ce4136bedae29a6dcc..8164a98efab614e4241576f27071021c51d282e4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FetchKernel.swift @@ -14,48 +14,48 @@ import Foundation -class FetchKernel: Kernel, Computable { - - required init(device: MTLDevice, param: FetchParam
<P>
, initContext: InitContext) { - param.output.initBuffer(device: device) - if GlobalConfig.shared.computePrecision == .Float16 { - if param.input.transpose == [0, 2, 3, 1] { - super.init(device: device, inFunctionName: "fetch_half", initContext: initContext) - } else if param.input.transpose == [0, 1, 2, 3] { - switch param.input.tensorDim.cout() { - case 1, 2: - super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext) - default: - fatalError(" not support ") +class FetchKernel: Kernel, Computable { + + required init(device: MTLDevice, param: FetchParam
<P>
, initContext: InitContext) { + param.output.initBuffer(device: device) + if GlobalConfig.shared.computePrecision == .Float16 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch_half", initContext: initContext) + } else if param.input.transpose == [0, 1, 2, 3] { + switch param.input.tensorDim.cout() { + case 1, 2: + super.init(device: device, inFunctionName: "fetch_1or2_half", initContext: initContext) + default: + fatalError(" not support ") + } + } else { + fatalError(" not support ") + } + } else if GlobalConfig.shared.computePrecision == .Float32 { + if param.input.transpose == [0, 2, 3, 1] { + super.init(device: device, inFunctionName: "fetch_float", initContext: initContext) + } else if param.input.transpose == [0, 1, 2, 3] { + switch param.input.tensorDim.cout() { + case 1, 2: + super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext) + default: + fatalError(" not support ") + } + } else { + fatalError(" not support ") + } + } else { + fatalError(" not support ") } - } else { - fatalError(" not support ") - } - } else if GlobalConfig.shared.computePrecision == .Float32 { - if param.input.transpose == [0, 2, 3, 1] { - super.init(device: device, inFunctionName: "fetch_float", initContext: initContext) - } else if param.input.transpose == [0, 1, 2, 3] { - switch param.input.tensorDim.cout() { - case 1, 2: - super.init(device: device, inFunctionName: "fetch_1or2_float", initContext: initContext) - default: - fatalError(" not support ") - } - } else { - fatalError(" not support ") - } - } else { - fatalError(" not support ") } - } - - func compute(commandBuffer: MTLCommandBuffer, param: FetchParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: FetchParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setBuffer(param.output.resultBuffer!, offset: 0, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift index 06a6537e1f9612aa646668fab678879b1d782df0..7e9f129a015c1859c0fb848815eb2c8f5d4eebc0 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/FlattenKernel.swift @@ -15,57 +15,57 @@ import Foundation struct FlattenMetalParam { - var idim: (Int32, Int32, Int32, Int32) - var itrans: (Int32, Int32, Int32, Int32) - var odim: (Int32, Int32, Int32, Int32) - var otrans: (Int32, Int32, Int32, Int32) + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) } -class FlattenKernel: Kernel, Computable{ - - var metalParam: FlattenMetalParam - - required init(device: MTLDevice, param: FlattenParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - var id: [Int32] = [1, 1, 1, 1] - for i in 0..) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") +class FlattenKernel: Kernel, Computable{ + + var metalParam: FlattenMetalParam + + required init(device: MTLDevice, param: FlattenParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0...size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + func compute(commandBuffer: MTLCommandBuffer, param: FlattenParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift index d3fc5a3ac9e62c05d892e26aeca9560943a9e240..d3350a4225ecaf273a716a99334a2b46317d61e5 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/MulticlassNMSKernel.swift @@ -14,42 +14,42 @@ import Foundation -class MulticlassNMSKernel: Kernel, Computable{ - let pipline1: MTLComputePipelineState - - required init(device: MTLDevice, param: MulticlassNMSParam
<P>
, initContext: InitContext) { +class MulticlassNMSKernel: Kernel, Computable{ + let pipline1: MTLComputePipelineState - param.middleOutput.initBuffer(device: device) - param.bboxOutput.initBuffer(device: device) - if GlobalConfig.shared.computePrecision == .Float32 { - pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) - super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) - super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext) - } else { - fatalError( " unsupport precision " ) + required init(device: MTLDevice, param: MulticlassNMSParam
<P>
, initContext: InitContext) { + + param.middleOutput.initBuffer(device: device) + param.bboxOutput.initBuffer(device: device) + if GlobalConfig.shared.computePrecision == .Float32 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) + super.init(device: device, inFunctionName: "nms_fetch_result", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + pipline1 = device.pipeLine(funcName: "nms_fetch_bbox_half", metalLoadMode: initContext.metalLoadMode, metalLibPath: initContext.metalLibPath) + super.init(device: device, inFunctionName: "nms_fetch_result_half", initContext: initContext) + } else { + fatalError( " unsupport precision " ) + } + } - } - - func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: MulticlassNMSParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.scores.metalTexture, index: 0) + encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture) + encoder.endEncoding() + + guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoderBox.setTexture(param.bboxes.metalTexture, index: 0) + encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0) + encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture) + encoderBox.endEncoding() } - - encoder.setTexture(param.scores.metalTexture, index: 0) - encoder.setBuffer(param.middleOutput.resultBuffer!, offset: 0, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.scores.metalTexture) - encoder.endEncoding() - - guard let encoderBox = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoderBox.setTexture(param.bboxes.metalTexture, index: 0) - encoderBox.setBuffer(param.bboxOutput.resultBuffer!, offset: 0, index: 0) - encoderBox.dispatch(computePipline: pipline1, outTexture: param.bboxes.metalTexture) - encoderBox.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift index b6833a4f93e82efbf4ffc28998624ce0b6432b52..2e7a4c38566ff3bd9da1ea79345cf015cbd17cbe 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PoolKernel.swift @@ -15,57 +15,57 @@ import Foundation struct PoolMetalParam { - let ksizeX: Int32 - let ksizeY: Int32 - let strideX: Int32 - let strideY: Int32 - let paddingX: Int32 - let paddingY: Int32 - let poolType: Int32 + let ksizeX: Int32 + let ksizeY: Int32 + let strideX: Int32 + let strideY: Int32 + let paddingX: Int32 + let paddingY: Int32 + let poolType: Int32 } -class PoolKernel: Kernel, Computable{ - var metalParam: PoolMetalParam - required init(device: MTLDevice, param: PoolParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - - var poolType: Int32 - switch param.poolType { - case "max": - poolType = 0 - case "avg": - poolType = 1 - default: - fatalError() +class PoolKernel: Kernel, Computable{ + var metalParam: PoolMetalParam + required init(device: MTLDevice, param: PoolParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + + var poolType: Int32 + switch param.poolType { + case "max": + poolType = 0 + case "avg": + poolType = 1 + default: + fatalError() + } + metalParam = PoolMetalParam.init( + ksizeX: param.ksize[0], + ksizeY: param.ksize[1], + strideX: param.stride[0], + strideY: param.stride[1], + paddingX: param.padding[0], + paddingY: param.padding[1], + poolType: poolType + ) + + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "pool_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "pool_half", initContext: initContext) + } else { + fatalError() + } } - metalParam = PoolMetalParam.init( - ksizeX: param.ksize[0], - ksizeY: param.ksize[1], - strideX: param.stride[0], - strideY: param.stride[1], - paddingX: param.padding[0], - paddingY: param.padding[1], - poolType: poolType - ) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "pool_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "pool_half", initContext: initContext) - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: PoolParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + func compute(commandBuffer: MTLCommandBuffer, param: PoolParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift index 61a21331a6bf1766a86a5849d7ea9001672642fa..906dc0fcfe650a57e6e9d66d6f17c0bf44e15294 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PreluKernel.swift @@ -14,40 +14,40 @@ import Foundation -class PreluKernel: Kernel, Computable{ - required init(device: MTLDevice, param: PreluParam
<P>
, initContext: InitContext) { - param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "prelu_element", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prelu_other", initContext: initContext) - } - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext) - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext) - } - } else { - fatalError() - } - } - - func compute(commandBuffer: MTLCommandBuffer, param: PreluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") +class PreluKernel: Kernel, Computable{ + required init(device: MTLDevice, param: PreluParam
<P>
, initContext: InitContext) { + param.alpha.initBuffer(device: device, precision: GlobalConfig.shared.computePrecision) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prelu_other", initContext: initContext) + } + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel_half", initContext: initContext) + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prelu_other_half", initContext: initContext) + } + } else { + fatalError() + } } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } + func compute(commandBuffer: MTLCommandBuffer, param: PreluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(param.alpha.buffer, offset: 0, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift index 15126bbc837f2997e6f693b4d6dbcfc85ba34109..009062c889a303da31835ffcfa57811ebcd3cb85 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/PriorBoxKernel.swift @@ -15,136 +15,136 @@ import Foundation struct PriorBoxMetalParam { - let offset: Float32 - let stepWidth: Float32 - let stepHeight: Float32 - let minSize: Float32 - let maxSize: Float32 - let imageWidth: Float32 - let imageHeight: Float32 - let clip: Bool - let numPriors: uint - let aspecRatiosSize: uint - let minSizeSize: uint - let maxSizeSize: uint + let offset: Float32 + let stepWidth: Float32 + let stepHeight: Float32 + let minSize: Float32 + let maxSize: Float32 + let imageWidth: Float32 + let imageHeight: Float32 + let clip: Bool + let numPriors: uint + let aspecRatiosSize: uint + let minSizeSize: uint + let maxSizeSize: uint } -class PriorBoxKernel: Kernel, Computable{ - var metalParam: PriorBoxMetalParam! - - required init(device: MTLDevice, param: PriorBoxParam
<P>
, initContext: InitContext) { - - let originDim = param.output.tensorDim; - - param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) - param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) - - param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) - param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision) - - if GlobalConfig.shared.computePrecision == .Float32 { - if param.min_max_aspect_ratios_order { - super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prior_box", initContext: initContext) - } - - } else if GlobalConfig.shared.computePrecision == .Float16 { - if param.min_max_aspect_ratios_order { - super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext) - } else { - super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext) - } - } else { - fatalError() - } - - - guard param.minSizes.count == 1 else { - fatalError(" need implement ") - } - -// let n = 1 -// let h = param.output.dim[1] -// let w = param.output.dim[2] -// let c = param.output.dim[3] * param.output.dim[0] -// -// param.output.dim = Dim.init(inDim: [n, h, w, c]) -// param.output.transpose = [0, 1, 2, 3] - - let imageWidth = Float32(param.inputImage.padToFourDim[3]) - let imageHeight = Float32(param.inputImage.padToFourDim[2]) - - let featureWidth = param.input.padToFourDim[3] - let featureHeight = param.input.padToFourDim[2] - - if param.stepW == 0 || param.stepH == 0 { - param.stepW = Float32(imageWidth) / Float32(featureWidth) - param.stepH = Float32(imageHeight) / Float32(featureHeight) - } - - var outputAspectRatior: [Float32] = [] - outputAspectRatior.append(1.0) - - let epsilon = 1e-6 - for ar in param.aspectRatios { - var alreadyExist = false - for outputAr in outputAspectRatior { - if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) { - alreadyExist = true - break +class PriorBoxKernel: Kernel, Computable{ + var metalParam: PriorBoxMetalParam! + + required init(device: MTLDevice, param: PriorBoxParam
<P>
, initContext: InitContext) { + + let originDim = param.output.tensorDim; + + param.output.tensorDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + param.output.padToFourDim = Dim.init(inDim: [1, originDim[0], originDim[1], originDim[2] * originDim[3]]) + + param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: GlobalConfig.shared.computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: GlobalConfig.shared.computePrecision) + + if GlobalConfig.shared.computePrecision == .Float32 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prior_box", initContext: initContext) + } + + } else if GlobalConfig.shared.computePrecision == .Float16 { + if param.min_max_aspect_ratios_order { + super.init(device: device, inFunctionName: "prior_box_MinMaxAspectRatiosOrder_half", initContext: initContext) + } else { + super.init(device: device, inFunctionName: "prior_box_half", initContext: initContext) + } + } else { + fatalError() } - } - - if !alreadyExist { - outputAspectRatior.append(ar) - } - if param.flip { - outputAspectRatior.append(1.0 / ar) - } - } - - if GlobalConfig.shared.computePrecision == .Float16 { - let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size) - float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) - param.newAspectRatios = buffer - - } else if GlobalConfig.shared.computePrecision == .Float32 { - let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) - param.newAspectRatios = buffer - } else { - fatalError() + + + guard param.minSizes.count == 1 else { + fatalError(" need implement ") + } + + // let n = 1 + // let h = param.output.dim[1] + // let w = param.output.dim[2] + // let c = param.output.dim[3] * param.output.dim[0] + // + // param.output.dim = Dim.init(inDim: [n, h, w, c]) + // param.output.transpose = [0, 1, 2, 3] + + let imageWidth = Float32(param.inputImage.padToFourDim[3]) + let imageHeight = Float32(param.inputImage.padToFourDim[2]) + + let featureWidth = param.input.padToFourDim[3] + let featureHeight = param.input.padToFourDim[2] + + if param.stepW == 0 || param.stepH == 0 { + param.stepW = Float32(imageWidth) / Float32(featureWidth) + param.stepH = Float32(imageHeight) / Float32(featureHeight) + } + + var outputAspectRatior: [Float32] = [] + outputAspectRatior.append(1.0) + + let epsilon = 1e-6 + for ar in param.aspectRatios { + var alreadyExist = false + for outputAr in outputAspectRatior { + if fabs(Double(ar) - Double(outputAr)) < Double(epsilon) { + alreadyExist = true + break + } + } + + if !alreadyExist { + outputAspectRatior.append(ar) + } + if param.flip { + outputAspectRatior.append(1.0 / ar) + } + } + + if GlobalConfig.shared.computePrecision == .Float16 { + let buffer = device.makeBuffer(length: outputAspectRatior.count * MemoryLayout.size) + float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) + param.newAspectRatios = buffer + + } else if GlobalConfig.shared.computePrecision == .Float32 { + let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) + param.newAspectRatios = buffer + } else { + fatalError() 
+ } + + let aspectRatiosSize = uint(outputAspectRatior.count) + + let maxSizeSize: uint = uint(param.maxSizes.count) + let minSizeSize: uint = uint(param.minSizes.count) + + let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize + + let minSize = param.minSizes.last ?? 0.0 + let maxSize = param.maxSizes.last ?? 0.0 + + metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize) + } - let aspectRatiosSize = uint(outputAspectRatior.count) - - let maxSizeSize: uint = uint(param.maxSizes.count) - let minSizeSize: uint = uint(param.minSizes.count) - - let numPriors = aspectRatiosSize * minSizeSize + maxSizeSize - - let minSize = param.minSizes.last ?? 0.0 - let maxSize = param.maxSizes.last ?? 0.0 - - metalParam = PriorBoxMetalParam.init(offset: param.offset, stepWidth: param.stepW, stepHeight: param.stepH, minSize: minSize, maxSize: maxSize, imageWidth: imageWidth, imageHeight: imageHeight, clip: param.clip, numPriors: numPriors, aspecRatiosSize: aspectRatiosSize, minSizeSize: minSizeSize, maxSizeSize: maxSizeSize) - - } - - func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + func compute(commandBuffer: MTLCommandBuffer, param: PriorBoxParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setTexture(param.outputVariances.metalTexture, index: 2) + + encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) + + encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setTexture(param.outputVariances.metalTexture, index: 2) - - encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) - - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) - - encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift index 0bde0623ef53dd8346fbc2f91843e06ed01c77d7..e4fe6fdf8c056689b7e529f6fa1ccce5a75d146d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift @@ -14,24 +14,24 @@ import Foundation -class ReluKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ReluParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") +class ReluKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ReluParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: ReluParam
<P>
, initContext: InitContext) { - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "relu", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "relu_half", initContext: initContext) - } else { - fatalError() + + required init(device: MTLDevice, param: ReluParam
<P>
, initContext: InitContext) { + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "relu", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "relu_half", initContext: initContext) + } else { + fatalError() + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift index f14db86a3a4904575be6ac1f0c70c36f99ce4305..0613e4952dacba2b336d017b9010cfa855f41dfb 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReshapeKernel.swift @@ -15,83 +15,83 @@ import Foundation struct ReshapeMetalParam { - var idim: (Int32, Int32, Int32, Int32) - var itrans: (Int32, Int32, Int32, Int32) - var odim: (Int32, Int32, Int32, Int32) - var otrans: (Int32, Int32, Int32, Int32) + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) } struct ReshapeTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - let param: ReshapeMetalParam + let inputTexture: MTLTexture + let outputTexture: MTLTexture + let param: ReshapeMetalParam } -class ReshapeKernel: Kernel, Computable{ - - var metalParam: ReshapeMetalParam - - required init(device: MTLDevice, param: ReshapeParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - var id: [Int32] = [1, 1, 1, 1] - for i in 0..: Kernel, Computable{ + + var metalParam: ReshapeMetalParam + + required init(device: MTLDevice, param: ReshapeParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + var id: [Int32] = [1, 1, 1, 1] + for i in 0..) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - } - - required init(device: MTLDevice, testParam: ReshapeTestParam, initContext: InitContext) { - metalParam = ReshapeMetalParam.init( - idim: (0, 0, 0, 0), - itrans: (0, 0, 0, 0), - odim: (0, 0, 0, 0), - otrans: (0, 0, 0, 0) - ) - super.init(device: device, inFunctionName: "reshape", initContext: initContext) - } - - func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - -// func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { -// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -// fatalError() -// } -// encoder.setTexture(testParam.inputTexture, index: 0) -// encoder.setTexture(testParam.outputTexture, index: 1) -// var pm: ReshapeMetalParam = testParam.param -// encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) -// encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) -// encoder.endEncoding() -// } + + // func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { + // guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + // fatalError() + // } + // encoder.setTexture(testParam.inputTexture, index: 0) + // encoder.setTexture(testParam.outputTexture, index: 1) + // var pm: ReshapeMetalParam = testParam.param + // encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) + // encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) + // encoder.endEncoding() + // } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift index a007196b6735de29f7de6a8ff28935baf4477a5f..5edf80332abf81772192218c96e524ab4a05d91a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ResizeBilinearKernel.swift @@ -15,37 +15,37 @@ import Foundation struct ResizeBilinearMetalParam { - var ratio_h: Float32 - var ratio_w: Float32 + var ratio_h: Float32 + var ratio_w: Float32 } -class ResizeBilinearKernel: Kernel, Computable{ - required init(device: MTLDevice, param: ResizeBilinearParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext) - } else { - fatalError() +class ResizeBilinearKernel: Kernel, Computable{ + required init(device: MTLDevice, param: ResizeBilinearParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "resize_bilinear", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "resize_bilinear_half", initContext: initContext) + } else { + fatalError() + } } - } - - func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: ResizeBilinearParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2]) + let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3]) + var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) + encoder.setBytes(&p, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - let ratio_h: Float32 = Float32(param.input.tensorDim.dims[2]) / Float32(param.output.tensorDim.dims[2]) - let ratio_w: Float32 = Float32(param.input.tensorDim.dims[3]) / Float32(param.output.tensorDim.dims[3]) - var p = ResizeBilinearMetalParam.init(ratio_h: ratio_h, ratio_w: ratio_w) - encoder.setBytes(&p, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - - + + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift index 2afee5607d3c67e9b125c436affbb9afa4ed2c5a..cab3f3b0db922612f4fc4752ba0c95a554c05ec1 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/Scale.swift @@ -14,15 +14,15 @@ import Foundation -class ScaleKernel: CusomKernel { - init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) - } else { - fatalError(" unsupport ") +public class ScaleKernel: CusomKernel { + public init(device: MTLDevice, shape: Shape, metalLoadMode: MetalLoadMode, metalLibPath: String?) { + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "scale", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "scale_half", outputDim: shape, metalLoadModel: metalLoadMode, metalLibPath: metalLibPath) + } else { + fatalError(" unsupport ") + } } - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift index dfec8f9adf9dfc6be3a835c8ea215b37cb1a948c..a4f250a175c85c99437e2ad00d511c90507ba2d6 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ShapeKernel.swift @@ -18,25 +18,25 @@ import Foundation struct ShapeMetalParam { } -class ShapeKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam
<P>
) throws { -// print("shape compute") -// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -// throw PaddleMobileError.predictError(message: " encode is nil") -// } -// encoder.setTexture(param.output.metalTexture, index: 0) -// encoder.endEncoding() - } - - required init(device: MTLDevice, param: ShapeParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "shape", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "shape_half", initContext: initContext) - } else { - fatalError() +class ShapeKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: ShapeParam
<P>
) throws { + // print("shape compute") + // guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + // throw PaddleMobileError.predictError(message: " encode is nil") + // } + // encoder.setTexture(param.output.metalTexture, index: 0) + // encoder.endEncoding() } - } - + + required init(device: MTLDevice, param: ShapeParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "shape", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "shape_half", initContext: initContext) + } else { + fatalError() + } + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift index 1eac43484d8759e2d1aefaef4b55fbde728a24d6..bccb37c47d820fe010bb4780885875bb886ae24d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SoftmaxKernel.swift @@ -15,37 +15,37 @@ import Foundation struct SoftmaxMetalParam { - let N: Int32 - let K: Int32 + let N: Int32 + let K: Int32 } -class SoftmaxKernel: Kernel, Computable{ - - var metalParam: SoftmaxMetalParam - required init(device: MTLDevice, param: SoftmaxParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - metalParam = SoftmaxMetalParam.init( - N: Int32(param.input.tensorDim[0]), - K: Int32(param.input.tensorDim[1]) - ) - if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "softmax_float", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "softmax_half", initContext: initContext) - } else { - fatalError() +class SoftmaxKernel: Kernel, Computable{ + + var metalParam: SoftmaxMetalParam + required init(device: MTLDevice, param: SoftmaxParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + metalParam = SoftmaxMetalParam.init( + N: Int32(param.input.tensorDim[0]), + K: Int32(param.input.tensorDim[1]) + ) + if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "softmax_float", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "softmax_half", initContext: initContext) + } else { + fatalError() + } } - } - - func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift index 8b07a87406a6c33767ac6552c0f8241602a89cb0..eaaffa7bfe8a07df4a1da2ebd5ca10b8d373f1e3 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/SplitKernel.swift @@ -15,79 +15,79 @@ import Foundation struct SplitMetalParam { - var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) - var axis: Int32 = 0 - var offset: Int32 = 0 - var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) - var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) + var idim: (Int32, Int32, Int32, Int32) = (1, 1, 1, 1) + var axis: Int32 = 0 + var offset: Int32 = 0 + var trans: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var vdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0) } -class SplitKernel: Kernel, Computable{ - var smp: SplitMetalParam - func compute(commandBuffer: MTLCommandBuffer, param: SplitParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") +class SplitKernel: Kernel, Computable{ + var smp: SplitMetalParam + func compute(commandBuffer: MTLCommandBuffer, param: SplitParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.metalTexture, index: 0) + for i in 0...size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) + encoder.endEncoding() } - encoder.setTexture(param.input.metalTexture, index: 0) - for i in 0.., initContext: InitContext) { + // param.output.initTexture(device: device, computePrecision: computePrecision) + let num = param.outputList.count + let rank = param.input.tensorDim.cout() + assert(num >= 2 && num <= 4) + for output in param.outputList { + output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + } + smp = SplitMetalParam.init() + smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3])) + smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout()) + for i in 0..<4 { + if param.input.transpose[i] == smp.axis { + smp.axis = Int32(i) + break + } + } + smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3])) + var vdim: [Int32] = [0, 0, 0, 0] + for i in 0...size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.input.metalTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: SplitParam
<P>
, initContext: InitContext) { - // param.output.initTexture(device: device, computePrecision: computePrecision) - let num = param.outputList.count - let rank = param.input.tensorDim.cout() - assert(num >= 2 && num <= 4) - for output in param.outputList { - output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) - } - smp = SplitMetalParam.init() - smp.idim = (Int32(param.input.dim[0]), Int32(param.input.dim[1]), Int32(param.input.dim[2]), Int32(param.input.dim[3])) - smp.axis = Int32(param.axis + param.input.dim.cout() - param.input.tensorDim.cout()) - for i in 0..<4 { - if param.input.transpose[i] == smp.axis { - smp.axis = Int32(i) - break - } - } - smp.trans = (Int32(param.input.transpose[0]), Int32(param.input.transpose[1]), Int32(param.input.transpose[2]), Int32(param.input.transpose[3])) - var vdim: [Int32] = [0, 0, 0, 0] - for i in 0..: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: FeedParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.input.mtlTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) - encoder.endEncoding() - } - - required init(device: MTLDevice, param: FeedParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) - if GlobalConfig.shared.computePrecision == .Float16 { - super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext) - } else if GlobalConfig.shared.computePrecision == .Float32 { - super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext) - } else { - fatalError() +class Texture2DTo2DArrayKernel: Kernel, Computable{ + func compute(commandBuffer: MTLCommandBuffer, param: FeedParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(param.input.mtlTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture) + encoder.endEncoding() } - } + required init(device: MTLDevice, param: FeedParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: GlobalConfig.shared.computePrecision) + if GlobalConfig.shared.computePrecision == .Float16 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array_half", initContext: initContext) + } else if GlobalConfig.shared.computePrecision == .Float32 { + super.init(device: device, inFunctionName: "texture2d_to_2d_array", initContext: initContext) + } else { + fatalError() + } + + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift index e1490052e7419591c57ad8cdf628708fd15beeb8..a2f87758348ed0b692937e9f7222f19c3550ac97 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/TransposeKernel.swift @@ -15,65 +15,65 @@ import Foundation struct TransposeMetalParam { - var iC: Int32 = 0 - var oC: Int32 = 0 - var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) + var iC: Int32 = 0 + var oC: Int32 = 0 + var axis: (Int32, Int32, Int32, Int32) = (0, 1, 2, 3) } -class TransposeKernel: Kernel, Computable { - var metalParam: TransposeMetalParam = TransposeMetalParam.init() - required init(device: MTLDevice, param: TransposeParam
<P>
, initContext: InitContext) { - param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) - let rank = param.input.tensorDim.cout() - var axis: [Int] = [0, 1, 2, 3] - for i in 0..: Kernel, Computable { + var metalParam: TransposeMetalParam = TransposeMetalParam.init() + required init(device: MTLDevice, param: TransposeParam
<P>
, initContext: InitContext) { + param.output.initTexture(device: device, computePrecision: GlobalConfig.shared.computePrecision) + let rank = param.input.tensorDim.cout() + var axis: [Int] = [0, 1, 2, 3] + for i in 0..", kernelFunc) + print(metalParam) + super.init(device: device, inFunctionName: kernelFunc, initContext: initContext) } - print("===========>", kernelFunc) - print(metalParam) - super.init(device: device, inFunctionName: kernelFunc, initContext: initContext) - } - - func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam
<P>
) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") + + func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam
<P>
) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } - - + + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.inc.metal deleted file mode 100644 index a590f8089890f2fab1af4c1f736f3bfc5708aecf..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BilinearInterp.inc.metal +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifdef P - -#define CONCAT2(a, b) a ## b -#define CONCAT2_(a, b) a ## _ ## b - -#define FUNC(f, p) CONCAT2_(f, p) -#define VECTOR(p, n) CONCAT2(p, n) - -kernel void FUNC(bilinear_interp, P)(texture2d_array input [[texture(0)]], - texture2d_array output [[texture(1)]], - constant bilinear_interp_param & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) r; - if ((input.get_width() == output.get_width()) && (input.get_height() == output.get_height())) { - r = input.read(gid.xy, gid.z); - } else { - P w = gid.x * pm.ratio_w; - P h = gid.y * pm.ratio_h; - uint w0 = w, h0 = h; - uint w1 = w0 + 1, h1 = h0 + 1; - P w1lambda = w - w0, h1lambda = h - h0; - P w2lambda = 1.0 - w1lambda, h2lambda = 1.0 - h1lambda; - if (w1 >= input.get_width()) w1 = w0; - if (h1 >= input.get_height()) h1 = h0; - VECTOR(P, 4) r0 = input.read(uint2(w0, h0), gid.z); - VECTOR(P, 4) r1 = input.read(uint2(w1, h0), gid.z); - VECTOR(P, 4) r2 = input.read(uint2(w0, h1), gid.z); - VECTOR(P, 4) r3 = input.read(uint2(w1, h1), gid.z); - r = h2lambda * (w2lambda * r0 + w1lambda * r1) - + h1lambda * (w2lambda * r2 + w1lambda * r3); - } - output.write(r, gid.xy, gid.z); -} - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.inc.metal deleted file mode 100644 index 918fbac1a713d7b0442a1eb1f07abea3616bec96..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/BoxCoder.inc.metal +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifdef P - -#define CONCAT2(a, b) a ## b -#define CONCAT2_(a, b) a ## _ ## b - -#define FUNC(f, p) CONCAT2_(f, p) -#define VECTOR(p, n) CONCAT2(p, n) -kernel void FUNC(boxcoder, P)(texture2d_array priorBox [[texture(0)]], - texture2d_array priorBoxVar [[texture(1)]], - texture2d_array targetBox [[texture(2)]], - texture2d_array output[[texture(3)]], - uint3 gid [[thread_position_in_grid]]) { - VECTOR(P, 4) p = priorBox.read(uint2(0, gid.x), gid.z); - VECTOR(P, 4) pv = priorBoxVar.read(uint2(0, gid.x), gid.z); - VECTOR(P, 4) t; - t[0] = targetBox.read(uint2(0, gid.x), gid.z)[0]; - t[1] = targetBox.read(uint2(1, gid.x), gid.z)[0]; - t[2] = targetBox.read(uint2(2, gid.x), gid.z)[0]; - t[3] = targetBox.read(uint2(3, gid.x), gid.z)[0]; - - P px = (p.x + p.z) / 2; - P py = (p.y + p.w) / 2; - P pw = p.z - p.x; - P ph = p.w - p.y; - - P tx = pv.x * t.x * pw + px; - P ty = pv.y * t.y * ph + py; - P tw = exp(pv.z * t.z) * pw; - P th = exp(pv.w * t.w) * ph; - - VECTOR(P, 4) r; - r.x = tx - tw / 2; - r.y = ty - th / 2; - r.z = tx + tw / 2; - r.w = ty + th / 2; - - output.write(r, gid.xy, gid.z); -} - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Common.metal deleted file mode 100644 index 40bae035c097b5ab386d78520b6b04f074eb2fee..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Common.metal +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include -using namespace metal; - - -inline void xyzn2abcd_1(int xyzn[4], int abcd[4]) { - abcd[0] = abcd[1] = abcd[2] = 0; - abcd[3] = xyzn[0] * 4 + xyzn[3]; -} -inline void xyzn2abcd_2(int xyzn[4], int abcd[4]) { - abcd[0] = abcd[1] = 0; - abcd[2] = xyzn[1]; - abcd[3] = xyzn[0] * 4 + xyzn[3]; -} -inline void xyzn2abcd_3(int xyzn[4], int abcd[4]) { - abcd[0] = 0; - abcd[3] = xyzn[0]; - abcd[2] = xyzn[1]; - abcd[1] = xyzn[2] * 4 + xyzn[3]; -} -inline void xyzn2abcd_4(int C, int xyzn[4], int abcd[4]) { - abcd[2] = xyzn[0]; - abcd[1] = xyzn[1]; - uint t = xyzn[2] * 4 + xyzn[3]; - abcd[0] = t / C; - abcd[3] = t % C; -} - -inline void abcd2xyzn_1(int abcd[4], int xyzn[4]) { - xyzn[1] = xyzn[2] = 0; - xyzn[0] = abcd[3] / 4; - xyzn[1] = abcd[3] % 4; -} -inline void abcd2xyzn_2(int abcd[4], int xyzn[4]) { - xyzn[2] = 0; - xyzn[1] = abcd[2]; - xyzn[0] = abcd[3] / 4; - xyzn[3] = abcd[3] % 4; -} -inline void abcd2xyzn_3(int abcd[4], int xyzn[4]) { - xyzn[0] = abcd[3]; - xyzn[1] = abcd[2]; - xyzn[2] = abcd[1] / 4; - xyzn[3] = abcd[1] % 4; -} -inline void abcd2xyzn_4(int C, int abcd[4], int xyzn[4]) { - xyzn[0] = abcd[2]; - xyzn[1] = abcd[1]; - uint t = abcd[0] * C + abcd[3]; - xyzn[2] = t / 4; - xyzn[3] = t % 4; -} - -inline void xyzn2abcd(int C, int xyzn[4], int abcd[4]) { - abcd[2] = xyzn[0]; - abcd[1] = xyzn[1]; - uint t = xyzn[2] * 4 + xyzn[3]; - abcd[0] = t / C; - abcd[3] = t % C; -} - -inline void abcd2xyzn(int C, int abcd[4], int xyzn[4]) { - xyzn[0] = abcd[2]; - xyzn[1] = abcd[1]; - uint t = abcd[0] * C + abcd[3]; - xyzn[2] = t / 4; - xyzn[3] = t % 4; -} - -inline int32_t abcd2index(int32_t dim[4], int32_t abcd[4]) { - int32_t r = abcd[0]; - r = r * dim[1] + abcd[1]; - r = r * dim[2] + abcd[2]; - r = r * dim[3] + abcd[3]; - return r; -} - -inline void index2abcd(int32_t dim[4], int32_t ind, int32_t abcd[4]) { - abcd[3] = ind % dim[3]; ind /= dim[3]; - abcd[2] = ind % dim[2]; ind /= dim[2]; - abcd[1] = ind % dim[1]; ind /= dim[1]; - abcd[0] = ind; -} - -inline void trans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { - for (int i = 0; i < 4; i++) { - opos[i] = ipos[trans[i]]; - } -} - -inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { - for (int i = 0; i < 4; i++) { - opos[trans[i]] = ipos[i]; - } -} - - -struct MetalConvParam { - short offsetX; - short offsetY; - short offsetZ; - ushort strideX; - ushort strideY; - ushort dilationX; - ushort dilationY; -}; - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.inc.metal deleted file mode 100644 index 2b070fc48b78391e96b93823eeff7f936de2ff7d..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.inc.metal +++ /dev/null @@ -1,318 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifdef P - -#define CONCAT2(a, b) a ## b -#define CONCAT2_(a, b) a ## _ ## b -#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c -#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d -#define CONCAT5_(a, b, c, d, e) a ## _ ## b ## _ ## c ## _ ## d ## _ ## e - -#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p) -#define VECTOR(p, n) CONCAT2(p, n) -#define FUNC_R(f, r) CONCAT2_(f, r) - -#if V == VX -#define VV x -#elif V == VY -#define VV y -#elif V == VZ -#define VV z -#else -#define VV normal -#endif - -#if V == VNORMAL -//kernel void FUNC(concat, R, N, normal, P)(array, N> in [[texture(0)]], -// texture2d_array out_x [[texture(N)]], -// texture2d_array out [[texture(N+1)]], -// constant ConcatParam & pm [[buffer(0)]], -// uint3 gid [[thread_position_in_grid]]) { -//} -kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], - texture2d_array in1 [[texture(1)]], -#if N >= 3 - texture2d_array in2 [[texture(2)]], -#endif -#if N >= 4 - texture2d_array in3 [[texture(3)]], -#endif -#if N >= 5 - texture2d_array in4 [[texture(4)]], -#endif -#if N >= 6 - texture2d_array in5 [[texture(5)]], -#endif - texture2d_array inx [[texture(N)]], - texture2d_array out [[texture(N+1)]], - constant ConcatParam & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - ConcatParam cp = pm; - int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4]; - VECTOR(P, 4) r = inx.read(gid.xy, gid.z); - for (int i = 0; i < 4; i++) { - xyzn[3] = i; -#if R == 4 - xyzn2abcd_4(cp.odim[3], xyzn, abcd); -#else - FUNC_R(xyzn2abcd, R)(xyzn, abcd); -#endif - int k = abcd[cp.axis] - cp.offset; - if (k < 0) continue; - int j = 0; - for (; j < N; j++) { - if (k < cp.vdim[j]) { - break; - } - k -= cp.vdim[j]; - } - if (j == N) { - continue; - } - int ta = cp.odim[cp.axis]; - abcd[cp.axis] = k; - cp.odim[cp.axis] = cp.vdim[j]; -#if R == 4 - abcd2xyzn_4(cp.odim[3], abcd, oxyzn); -#else - FUNC_R(abcd2xyzn, R)(abcd, oxyzn); -#endif - cp.odim[cp.axis] = ta; - switch (j) { - case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; - case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; -#if N >= 3 - case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; -#endif -#if N >= 4 - case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; -#endif -#if N >= 5 - case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; -#endif -#if N >= 6 - case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; -#endif - } - } - out.write(r, gid.xy, gid.z); -} - -#endif // V == NORMAL - - - -#if V == VX -kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], - texture2d_array in1 [[texture(1)]], -#if N >= 3 - texture2d_array in2 [[texture(2)]], -#endif // N >= 3 -#if N >= 4 - texture2d_array in3 [[texture(3)]], -#endif // N >= 4 -#if N >= 5 - texture2d_array in4 [[texture(4)]], -#endif // N >= 5 -#if N >= 6 - texture2d_array in5 [[texture(5)]], -#endif // N >= 6 - texture2d_array out [[texture(N)]], - constant ConcatParam & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - int x = gid.x - pm.offset; - if (x < 0) return; - if (x < pm.vdim[0]) { - VECTOR(P, 4) r = in0.read(gid.xy, gid.z); - out.write(r, gid.xy, gid.z); - return; - } - x -= pm.vdim[0]; - if (x < pm.vdim[1]) { - VECTOR(P, 4) r = in1.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#if N >= 3 - x -= pm.vdim[1]; - if (x < pm.vdim[2]) { - VECTOR(P, 4) r = 
in2.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 3 -#if N >= 4 - x -= pm.vdim[2]; - if (x < pm.vdim[3]) { - VECTOR(P, 4) r = in3.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 4 -#if N >= 5 - x -= pm.vdim[3]; - if (x < pm.vdim[4]) { - VECTOR(P, 4) r = in4.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 5 -#if N >= 6 - x -= pm.vdim[4]; - if (x < pm.vdim[5]) { - VECTOR(P, 4) r = in5.read(uint2(x, gid.y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 6 -} -#endif // V == VX - -#if V == VY -kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], - texture2d_array in1 [[texture(1)]], -#if N >= 3 - texture2d_array in2 [[texture(2)]], -#endif // N >= 3 -#if N >= 4 - texture2d_array in3 [[texture(3)]], -#endif // N >= 4 -#if N >= 5 - texture2d_array in4 [[texture(4)]], -#endif // N >= 5 -#if N >= 6 - texture2d_array in5 [[texture(5)]], -#endif // N >= 6 - texture2d_array out [[texture(N)]], - constant ConcatParam & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - int y = gid.y - pm.offset; - if (y < 0) return; - if (y < pm.vdim[0]) { - VECTOR(P, 4) r = in0.read(gid.xy, gid.z); - out.write(r, gid.xy, gid.z); - return; - } - y -= pm.vdim[0]; - if (y < pm.vdim[1]) { - VECTOR(P, 4) r = in1.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#if N >= 3 - y -= pm.vdim[1]; - if (y < pm.vdim[2]) { - VECTOR(P, 4) r = in2.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 3 -#if N >= 4 - y -= pm.vdim[2]; - if (y < pm.vdim[3]) { - VECTOR(P, 4) r = in3.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 4 -#if N >= 5 - y -= pm.vdim[3]; - if (y < pm.vdim[4]) { - VECTOR(P, 4) r = in4.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 5 -#if N >= 6 - y -= pm.vdim[4]; - if (y < pm.vdim[5]) { - VECTOR(P, 4) r = in5.read(uint2(gid.x, y), gid.z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 6 -} -#endif // V == VY - -#if V == VZ -kernel void FUNC(concat, R, N, VV, P)(texture2d_array in0 [[texture(0)]], - texture2d_array in1 [[texture(1)]], -#if N >= 3 - texture2d_array in2 [[texture(2)]], -#endif // N >= 3 -#if N >= 4 - texture2d_array in3 [[texture(3)]], -#endif // N >= 4 -#if N >= 5 - texture2d_array in4 [[texture(4)]], -#endif // N >= 5 -#if N >= 6 - texture2d_array in5 [[texture(5)]], -#endif // N >= 6 - texture2d_array out [[texture(N)]], - constant ConcatParam & pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - int z = gid.z - pm.offset; - if (z < 0) return; - if (z < pm.vdim[0]) { - VECTOR(P, 4) r = in0.read(gid.xy, gid.z); - out.write(r, gid.xy, gid.z); - return; - } - z -= pm.vdim[0]; - if (z < pm.vdim[1]) { - VECTOR(P, 4) r = in1.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } -#if N >= 3 - z -= pm.vdim[1]; - if (z < pm.vdim[2]) { - VECTOR(P, 4) r = in2.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 3 -#if N >= 4 - z -= pm.vdim[2]; - if (z < pm.vdim[3]) { - VECTOR(P, 4) r = in3.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 4 -#if N >= 5 - z -= pm.vdim[3]; - if (z < pm.vdim[4]) { - VECTOR(P, 4) r = in4.read(gid.xy, z); - out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 5 -#if N >= 6 - z -= pm.vdim[4]; - if (z < pm.vdim[5]) { - VECTOR(P, 4) r = in5.read(gid.xy, z); - 
out.write(r, gid.xy, gid.z); - return; - } -#endif // N >= 6 -} -#endif // V == VZ - - -#undef VV -#endif // #ifdef P diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.metal deleted file mode 100644 index b7d17f2d25de544e4ce938c577e0d04f536da9af..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConcatKernel.metal +++ /dev/null @@ -1,171 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include "Common.metal" - -using namespace metal; - -struct ConcatParam { - int32_t odim[4]; - int32_t axis; - int32_t offset; - int32_t trans[4]; - int32_t vdim[6]; -}; - -#define VNORMAL 1 -#define VX 2 -#define VY 3 -#define VZ 4 - -// >> fast mode -// only support concat_{2,3,4}_{2,3,4,5,6}_y_{float,half} -// only support concat_{3,4}_{2,3,4,5,6}_x_{float,half} -// only support concat_{1,2,3,4}_{2,3,4,5,6}_z_{float,half} -// >> normal mode (loop mode) -// ssd-ar: (R=4, N=3, V=z), (R=3, N=2, V=y), (R=2, N=5, V=x), (R=3, N=5, V=x) -// ssd: (R=2, N=6, V=y), (R=3, N=6, V=y) -// genet: (R=4, N=2, V=normal) - -// ssd-ar: (R=3, N=5, V=x) -#define V VX - #define R 3 - #define N 5 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - -// ssd-ar: (R=2, N=5, V=x) -#define V VX - #define R 2 - #define N 5 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - - -// ssd-ar: (R=3, N=2, V=y) -#define V VY - #define R 3 - #define N 2 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - -// ssd-ar: (R=4, N=3, V=z) -#define V VZ - #define R 4 - #define N 3 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - - -// ssd: (R=2, N=6, V=y) -#define V VY - #define R 2 - #define N 6 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - -// ssd: (R=3, N=6, V=y) -#define V VY - #define R 3 - #define N 6 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - -#define V VNORMAL - #define R 4 - #define N 2 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - - -#define V VY - #define R 2 - #define N 2 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef 
N - #undef R -#undef V - - -#define V VY - #define R 2 - #define N 5 - #define P float - #include "ConcatKernel.inc.metal" - #undef P - #define P half - #include "ConcatKernel.inc.metal" - #undef P - #undef N - #undef R -#undef V - - - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddBNReluKernel.metal deleted file mode 100644 index 87b60a64fc48ab89af274e0b24897e0b411599e0..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddBNReluKernel.metal +++ /dev/null @@ -1,310 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include "Common.metal" -using namespace metal; - - -kernel void conv_add_batch_norm_relu_1x1_half( - texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); -} - -kernel void conv_add_batch_norm_relu_3x3_half( - texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= 
outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); -} - -kernel void depthwise_conv_add_batch_norm_relu_3x3_half( - texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device half4 *new_scale [[buffer(3)]], - const device half4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, 
float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + float4(biase[gid.z])) * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); -} - - - -/*---------------------------------------------*/ - - - -kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 
1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + 
new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddMetal.metal deleted file mode 100644 index 274e416576743a473ba8931bcd538e9c39415f3c..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddMetal.metal +++ /dev/null @@ -1,622 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include "Common.metal" - -using namespace metal; - -#pragma mark - convAdd -kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 9; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - ushort 
dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - float4 input[9]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_5x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - ushort dilation_y = param.dilationY; - float4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); - - for (int j = 0; j < 5; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 
weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void conv_add_1x5(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - float4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); - - for (int j = 0; j < 5; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = biase[gid.z]; - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, 
float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } -// output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - - -#pragma mark - half - -kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - 
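// Weight layout, as implied by the indexing in the j-loop below (it is not documented elsewhere
// in this file): weights is packed as [output slice][output channel 0..3][kernel tap
// 0..kernelHXW-1][input slice], so the row for output slice gid.z, channel c, tap j and input
// slice i sits at
//   weithTo + c * kernelHXW * input_arr_size + j * input_arr_size + i
// and each dot product folds four input channels into one output channel at a time.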
input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(float4(input[j]), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(float4(input[j]), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(float4(input[j]), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(float4(input[j]), float4(weight_w)); - } - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - half4 output = biase[gid.z]; - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void conv_add_5x1_half(texture2d_array inTexture [[texture(0)]], - 
texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - ushort dilation_y = param.dilationY; - half4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); - - for (int j = 0; j < 5; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void conv_add_1x5_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - half4 input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); - - for (int j = 0; j < 5; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; 
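// Precision note: unlike conv_add_3x3_half above, which promotes both operands to float4
// before each dot product, this 1x5 kernel (like the 1x1 and 5x1 half variants) accumulates
// directly into a half4 output, trading some precision for fewer conversions.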
- output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } -// output = output + float4(biase[gid.z]); - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - if (gid.x > 0 || gid.y > 0 || gid.z > 0) { return; } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 9; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - float4 input[9]; - - for (uint i = 0; i < input_arr_size; ++i) { - - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - // output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPrelu.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPrelu.inc.metal deleted file mode 100644 index 069daa20e875eb00c0d518e0463987248ca8dce5..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPrelu.inc.metal +++ 
/dev/null @@ -1,447 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifdef P - -#include "Macro.metal" - - -#pragma mark - convAdd -kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], -#ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], -#endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z]; - - VECTOR(P, 4) input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample,float2(posInInput.x, posInInput.y), i); - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - -// output = output + float4(biase[gid.z]); - -#ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); -#endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); -} - -kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], -#ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], -#endif - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 9; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - VECTOR(P, 4) input[9]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - - for (int j = 0; j < 9; ++j) { - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } -// output = output + float4(biase[gid.z]); - -#ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? 
output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); -#endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); -} - -kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], -#ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], -#endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z];; - - ushort dilation_y = param.dilationY; - VECTOR(P, 4) input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); - - for (int j = 0; j < 5; ++j) { - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - -#ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? 
output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); -#endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); -} - - -kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device VECTOR(P, 4) *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], -#ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], -#endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - - const uint kernelHXW = 5; - - uint input_arr_size = inTexture.get_array_size(); - - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - VECTOR(P, 4) output = biase[gid.z]; - - ushort dilation_x = param.dilationX; - VECTOR(P, 4) input[5]; - - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); - - input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - - input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - - input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - - input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); - - for (int j = 0; j < 5; ++j) { - VECTOR(P, 4) weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - VECTOR(P, 4) weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - VECTOR(P, 4) weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - VECTOR(P, 4) weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - -#ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? 
output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); -#endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); -} - -kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device P *weights [[buffer(1)]], - const device VECTOR(P, 4) *biase [[buffer(2)]], -#ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(3)]], -#endif -#ifdef PRELU_OTHER - const device P *alpha [[buffer(3)]], -#endif - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - VECTOR(P, 4) output = biase[gid.z]; - VECTOR(P, 4) inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - VECTOR(P, 4) input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - -#ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? 
output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? output.w : (alpha_value * output.w); -#endif - outTexture.write(VECTOR(P, 4)(output), gid.xy, gid.z); -} - -#endif - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPreluKernel.metal deleted file mode 100644 index f03a1d5b625cf01f1f1bc5ac23bebf7dabd968d9..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvAddPreluKernel.metal +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include "Common.metal" -using namespace metal; - -#define P float - - #define PRELU_CHANNEL prelu_channel - #define PRELU_TYPE prelu_channel - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_CHANNEL - - #define PRELU_ELEMENT prelu_element - #define PRELU_TYPE prelu_element - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_ELEMENT - - #define PRELU_OTHER prelu_other - #define PRELU_TYPE prelu_other - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_OTHER - -#undef P - -#define P half - - #define PRELU_CHANNEL prelu_channel - #define PRELU_TYPE prelu_channel - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_CHANNEL - - #define PRELU_ELEMENT prelu_element - #define PRELU_TYPE prelu_element - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_ELEMENT - - #define PRELU_OTHER prelu_other - #define PRELU_TYPE prelu_other - #include "ConvAddPrelu.inc.metal" - #undef PRELU_TYPE - #undef PRELU_OTHER - -#undef P - - - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvBNReluKernel.metal deleted file mode 100644 index 4b97b7829a1fba27704fe7b60a03b2672f4f5953..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvBNReluKernel.metal +++ /dev/null @@ -1,297 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include <metal_stdlib> -#include "Common.metal" - -using namespace metal; - -#pragma mark - conv bn relu -kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]], - texture2d_array<float, access::write> outTexture [[texture(1)]], - constant MetalConvParam &param [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], - texture2d_array<float, access::write> outTexture [[texture(1)]], - constant MetalConvParam &param [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; -
output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -#pragma mark - half -kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *new_scale [[buffer(2)]], - const device half4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = 
inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(float4(input), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(float4(input), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(float4(input), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(float4(input), float4(weight_w)); - } - output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); -} - -kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *new_scale [[buffer(2)]], - const device half4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(float4(input[j]), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(float4(input[j]), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(float4(input[j]), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(float4(input[j]), float4(weight_w)); - } - } - output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); -} - -kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - 
constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *new_scale [[buffer(2)]], - const device half4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); - outTexture.write(half4(output), gid.xy, gid.z); -} - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvKernel.metal deleted file mode 100644 index c07515c13da54c7f8bf698f976e47f7cda6de32b..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ConvKernel.metal +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include -#include "Common.metal" -using namespace metal; - -// conv -#pragma mark -- conv -kernel void conv_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, 
posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), 
i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(float4(input[j]), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(float4(input[j]), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(float4(input[j]), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(float4(input[j]), float4(weight_w)); - } - } - outTexture.write(half4(output), gid.xy, gid.z); -} - -kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); - output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); - output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); - output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); - } - outTexture.write(half4(output), gid.xy, gid.z); -} - -kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr 
sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(float4(input), float4(weight_x)); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(float4(input), float4(weight_y)); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(float4(input), float4(weight_z)); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(float4(input), float4(weight_w)); - } - outTexture.write(half4(output), gid.xy, gid.z); -} - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Elementwise.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Elementwise.metal deleted file mode 100644 index b152df828106acd96171a89f4f636f308e0e9e39..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Elementwise.metal +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include <metal_stdlib> -#include "Common.metal" - -using namespace metal; - -struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; -}; - -kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]], - texture2d_array<float, access::read> inputY [[texture(1)]], - texture2d_array<float, access::write> outTexture [[texture(2)]], - constant ElementwiseAddParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - float4 rx, ry; - - if (pm.fast == 1) { - rx = inputX.read(gid.xy, gid.z); - ry = inputY.read(gid.xy, gid.z); - } else { - rx = inputX.read(gid.xy, gid.z); - int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; - int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; - int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; - int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; - int32_t yshift = 4 - pm.ylen - pm.axis; - for (int n = 0; n < 4; n++) { - x_xyzn[3] = n; - xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); - invtrans(xtrans, x_abcd, t_abcd); - for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { - y_abcd[yshift+k] = t_abcd[k]; - } - trans(ytrans, y_abcd, t_abcd); - abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); - ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; - } - } - float4 r = rx + ry; - outTexture.write(r, gid.xy, gid.z); -} - -kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]], - texture2d_array<half, access::read> inputY [[texture(1)]], - texture2d_array<half, access::write> outTexture [[texture(2)]], - constant ElementwiseAddParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - half4 rx, ry; - - if (pm.fast == 1) { - rx = inputX.read(gid.xy, gid.z); - ry = inputY.read(gid.xy, gid.z); - } else { - rx = inputX.read(gid.xy, gid.z); - int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; - int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; - int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; - int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; - int32_t yshift = 4 - pm.ylen - pm.axis; - for (int n = 0; n < 4; n++) { - x_xyzn[3] = n; - xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); - invtrans(xtrans, x_abcd, t_abcd); - for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { - y_abcd[yshift+k] = t_abcd[k]; - } - trans(ytrans, y_abcd, t_abcd); - abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); - ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; - } - } - half4 r = rx + ry; - outTexture.write(r, gid.xy, gid.z); -} diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal deleted file mode 100644 index b1d68d680962c53778d624ab15bfcfeb1d1a3142..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License.
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifdef P - -#include -#include "Macro.metal" - -using namespace metal; - -kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array inputX [[texture(0)]], - texture2d_array inputY [[texture(1)]], - texture2d_array outTexture [[texture(2)]], - constant ElementwiseAddParam &pm [[buffer(0)]], -#ifdef PRELU_CHANNEL - const device VECTOR(P, 4) *alpha [[buffer(1)]], -#endif -#ifdef PRELU_ELEMENT - const device VECTOR(P, 4) *alpha [[buffer(1)]], -#endif -#ifdef PRELU_OTHER - const device P *alpha [[buffer(1)]], -#endif - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - VECTOR(P, 4) rx, ry; - - if (pm.fast == 1) { - rx = inputX.read(gid.xy, gid.z); - ry = inputY.read(gid.xy, gid.z); - } else { - rx = inputX.read(gid.xy, gid.z); - int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; - int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; - int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]}; - int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]}; - int32_t yshift = 4 - pm.ylen - pm.axis; - for (int n = 0; n < 4; n++) { - x_xyzn[3] = n; - xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd); - invtrans(xtrans, x_abcd, t_abcd); - for (int k = pm.axis; k < (pm.axis + pm.ylen); k++) { - y_abcd[yshift+k] = t_abcd[k]; - } - trans(ytrans, y_abcd, t_abcd); - abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn); - ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; - } - } - VECTOR(P, 4) output = rx + ry; - -#ifdef PRELU_CHANNEL - VECTOR(P, 4) alpha_value = alpha[gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_ELEMENT - int alpha_to = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size(); - VECTOR(P, 4) alpha_value = alpha[alpha_to + gid.z]; - output.x = output.x > 0 ? output.x : (alpha_value.x * output.x); - output.y = output.y > 0 ? output.y : (alpha_value.y * output.y); - output.z = output.z > 0 ? output.z : (alpha_value.z * output.z); - output.w = output.w > 0 ? output.w : (alpha_value.w * output.w); -#endif -#ifdef PRELU_OTHER - P alpha_value = alpha[0]; - output.x = output.x > 0 ? output.x : (alpha_value * output.x); - output.y = output.y > 0 ? output.y : (alpha_value * output.y); - output.z = output.z > 0 ? output.z : (alpha_value * output.z); - output.w = output.w > 0 ? 
output.w : (alpha_value * output.w); -#endif - - outTexture.write(output, gid.xy, gid.z); -} - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.inc.metal deleted file mode 100644 index 9655b0fc1a02912cf64b29457a384d95231a25be..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/FetchKernel.inc.metal +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#ifdef P - -#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c -#define CONCAT2_(a, b) a ## _ ## b -#define CONCAT2(a, b) a ## b -#define FUNC(m, n, q) CONCAT3_(m, n, q) -#define FUNC_T(m, n) CONCAT2_(m, n) - -#define VECTOR(p, n) CONCAT2(p, n) - -kernel void FUNC_T(fetch, P)(texture2d_array inTexture [[texture(0)]], - device float *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - int input_height = inTexture.get_height(); - const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); - int output_to = 4 * input_width * input_height; - - output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; - - output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; - output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; - output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; -} - -kernel void FUNC(fetch, 1or2, P)(texture2d_array inTexture [[texture(0)]], - device float4 *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = float4(input); -} - - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/NMSFetchResultKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/NMSFetchResultKernel.metal deleted file mode 100644 index 44c57440e1ec138717ad1bc569fd772e0d7ede1a..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/NMSFetchResultKernel.metal +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -using namespace metal; - -kernel void nms_fetch_result(texture2d_array inTexture [[texture(0)]], - device float *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - const float4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = input.x; - -} - - -kernel void nms_fetch_result_half(texture2d_array inTexture [[texture(0)]], - device float *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); - const half4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = input.x; -} - -kernel void nms_fetch_bbox(texture2d_array inTexture [[texture(0)]], - device float4 *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); -// int input_height = inTexture.get_height(); - const float4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = input; -} - -kernel void nms_fetch_bbox_half(texture2d_array inTexture [[texture(0)]], - device float4 *output [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || - gid.z >= inTexture.get_array_size()) { - return; - } - - int input_width = inTexture.get_width(); -// int input_height = inTexture.get_height(); - const half4 input = inTexture.read(gid.xy, gid.z); - output[gid.y * input_width + gid.x] = float4(input); -} - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.inc.metal deleted file mode 100644 index 3c36ba06f543f6f6cf3e1c234c5326e1f00fdc04..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PoolKernel.inc.metal +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifdef P - -kernel void FUNC2_(pool, P)(texture2d_array<P, access::sample> inTexture [[texture(0)]], - texture2d_array<P, access::write> outTexture [[texture(1)]], - constant PoolParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int xmin = gid.x * pm.strideX - pm.paddingX; - int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); - xmin = max(xmin, 0); - int ymin = gid.y * pm.strideX - pm.paddingX; - int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); - ymin = max(ymin, 0); - - VECTOR(P, 4) r = 0; - if (pm.poolType == 0) { - r = inTexture.read(uint2(xmin, ymin), gid.z); - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r = fmax(r, inTexture.read(uint2(x, y), gid.z)); - } - } - } else if (pm.poolType == 1) { - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r += inTexture.read(uint2(x, y), gid.z); - } - } - r /= (xmax - xmin) * (ymax - ymin); - } - outTexture.write(r, gid.xy, gid.z); -} - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PreluKernel.metal deleted file mode 100644 index 597804137743dd253d05d91a5008f558dcaf42e7..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PreluKernel.metal +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include <metal_stdlib> -using namespace metal; - -kernel void prelu_channel(texture2d_array<float, access::sample> inTexture [[texture(0)]], - texture2d_array<float, access::write> outTexture [[texture(1)]], - const device float4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - float4 alpha_value = alpha[gid.z]; - float4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ?
input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void prelu_element(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device float4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - - int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); - float4 alpha_value = alpha[alpha_to + gid.z]; - - float4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void prelu_other(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device float *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - float alpha_value = alpha[0]; - float4 output; - output.x = input.x > 0 ? input.x : (alpha_value * input.x); - output.y = input.y > 0 ? input.y : (alpha_value * input.y); - output.z = input.z > 0 ? input.z : (alpha_value * input.z); - output.w = input.w > 0 ? input.w : (alpha_value * input.w); - outTexture.write(output, gid.xy, gid.z); -} - - -kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - half4 alpha_value = alpha[gid.z]; - half4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half4 *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - - int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); - half4 alpha_value = alpha[alpha_to + gid.z]; - - half4 output; - output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); - output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); - output.z = input.z > 0 ? 
input.z : (alpha_value.z * input.z); - output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - const device half *alpha [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]){ - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); - half alpha_value = alpha[0]; - half4 output; - output.x = input.x > 0 ? input.x : (alpha_value * input.x); - output.y = input.y > 0 ? input.y : (alpha_value * input.y); - output.z = input.z > 0 ? input.z : (alpha_value * input.z); - output.w = input.w > 0 ? input.w : (alpha_value * input.w); - outTexture.write(output, gid.xy, gid.z); -} - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PriorBoxKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PriorBoxKernel.metal deleted file mode 100644 index 7630febf77210bb364f0191e8b10a5a6923d6c95..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/PriorBoxKernel.metal +++ /dev/null @@ -1,367 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include -using namespace metal; - -struct PriorBoxMetalParam { - float offset; - float stepWidth; - float stepHeight; - float minSize; - float maxSize; - float imageWidth; - float imageHeight; - - bool clip; - - uint numPriors; - uint aspecRatiosSize; - uint minSizeSize; - uint maxSizeSize; -}; - -kernel void prior_box(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device float *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - if (gid.z < param.aspecRatiosSize) { - float ar = aspect_ratios[gid.z]; - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } - - outBoxTexture.write(res, gid.xy, gid.z); - } else if (gid.z >= param.aspecRatiosSize) { - if (param.maxSizeSize > 0) { - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; - } - outBoxTexture.write(max_box, gid.xy, gid.z); - } - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(variances_output, gid.xy, gid.z); - } -} - - -kernel void prior_box_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device half *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - if (gid.z < param.aspecRatiosSize) { - half ar = aspect_ratios[gid.z]; - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } - - outBoxTexture.write(half4(res), gid.xy, gid.z); - } 
else if (gid.z >= param.aspecRatiosSize) { - if (param.maxSizeSize > 0) { - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; - } - outBoxTexture.write(half4(max_box), gid.xy, gid.z); - } - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(half4(variances_output), gid.xy, gid.z); - } -} - - - -kernel void prior_box_MinMaxAspectRatiosOrder(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device float *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - - - if (gid.z == 0) { - box_width = box_height = param.minSize / 2; - - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } - - outBoxTexture.write(res, gid.xy, gid.z); - } - - if (gid.z == 1 && param.maxSizeSize > 0) { - - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; - } - outBoxTexture.write(res, gid.xy, gid.z); - } - - int aspect_to = 0; - if (param.maxSizeSize > 0) { - aspect_to = gid.z - 2; - } else { - aspect_to = gid.z - 1; - } - - - - - if (aspect_to >= 0 && aspect_to < int(param.aspecRatiosSize)) { - - int skip = 0; - for (int i = 0; i < aspect_to + 1; ++i) { - if (fabs(aspect_ratios[i] - 1.) 
< 1e-6) { - skip += 1; - } - } - aspect_to += skip; - - float ar = aspect_ratios[aspect_to]; - - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } - - outBoxTexture.write(res, gid.xy, gid.z); - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(variances_output, gid.xy, gid.z); - } -} - - -kernel void prior_box_MinMaxAspectRatiosOrder_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outBoxTexture [[texture(1)]], - texture2d_array varianceTexture [[texture(2)]], - const device half *aspect_ratios [[buffer(0)]], - constant PriorBoxMetalParam ¶m [[buffer(1)]], - const device float4 *variances [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outBoxTexture.get_width() || - gid.y >= outBoxTexture.get_height() || - gid.z >= outBoxTexture.get_array_size()) return; - - float center_x = (gid.x + param.offset) * param.stepWidth; - float center_y = (gid.y + param.offset) * param.stepHeight; - - float box_width, box_height; - - - - if (gid.z == 0) { - box_width = box_height = param.minSize / 2; - - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } - - outBoxTexture.write(half4(res), gid.xy, gid.z); - } - - if (gid.z == 1 && param.maxSizeSize > 0) { - - box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; - float4 max_box; - max_box.x = (center_x - box_width) / param.imageWidth; - max_box.y = (center_y - box_height) / param.imageHeight; - max_box.z = (center_x + box_width) / param.imageWidth; - max_box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = min(max(max_box, 0.0), 1.0); - } else { - res = max_box; - } - outBoxTexture.write(half4(res), gid.xy, gid.z); - } - - int aspect_to = 0; - if (param.maxSizeSize > 0) { - aspect_to = gid.z - 2; - } else { - aspect_to = gid.z - 1; - } - - if (aspect_to > 0 && aspect_to < int(param.aspecRatiosSize) && fabs(aspect_ratios[aspect_to] - 1.) 
> 1e-6) { - float ar = aspect_ratios[aspect_to]; - - box_width = param.minSize * sqrt(ar) / 2; - box_height = param.minSize / sqrt(ar) / 2; - float4 box; - box.x = (center_x - box_width) / param.imageWidth; - box.y = (center_y - box_height) / param.imageHeight; - box.z = (center_x + box_width) / param.imageWidth; - box.w = (center_y + box_height) / param.imageHeight; - - float4 res; - if (param.clip) { - res = fmin(fmax(box, 0.0), 1.0); - } else { - res = box; - } - - outBoxTexture.write(half4(res), gid.xy, gid.z); - } - - float4 variance = variances[0]; - if (gid.z < param.numPriors) { - float4 variances_output; - variances_output.x = variance.x; - variances_output.y = variance.y; - variances_output.z = variance.z; - variances_output.w = variance.w; - varianceTexture.write(half4(variances_output), gid.xy, gid.z); - } -} - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.inc.metal deleted file mode 100644 index 7583537c2b404b7a95eeedfb4c69793a608f18ac..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/ReshapeKernel.inc.metal +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifdef P - -#define CONCAT2(a, b) a ## b -#define CONCAT2_(a, b) a ## _ ## b -#define CONCAT3_(a, b, c) a ## _ ## b ## _ ## c -#define CONCAT4_(a, b, c, d) a ## _ ## b ## _ ## c ## _ ## d - -#define FUNC(f, r1, r2, p) CONCAT4_(f, r1, r2, p) -#define VECTOR(p, n) CONCAT2(p, n) -#define FUNC_R(f, r) CONCAT2_(f, r) - -kernel void FUNC(reshape, RIN, ROUT, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant ReshapeParam &rp [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - - int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4]; - ReshapeParam lrp = rp; - int oC = lrp.odim[lrp.otrans[3]]; - int iC = lrp.idim[lrp.itrans[3]]; - int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3]; - VECTOR(P, 4) r; - for (int n = 0; n < 4; n++) { - oxyzn[3] = n; -#if ROUT == 4 - xyzn2abcd_4(oC, oxyzn, oabcd); -#else - FUNC_R(xyzn2abcd, ROUT)(oxyzn, oabcd); -#endif - int tabcd[4]; - invtrans(lrp.otrans, oabcd, tabcd); - int index = abcd2index(lrp.odim, tabcd); - if (index < count) { - index2abcd(lrp.idim, index, tabcd); - trans(lrp.itrans, tabcd, iabcd); -#if RIN == 4 - abcd2xyzn_4(iC, iabcd, ixyzn); -#else - FUNC_R(abcd2xyzn, RIN)(iabcd, ixyzn); -#endif - r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; - } else { - r[n] = 0; - } - } - outTexture.write(r, gid.xy, gid.z); -} - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Scale.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Scale.metal deleted file mode 100644 index ae4ccdef751535765b78b0573b012ace30b16811..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Scale.metal +++ /dev/null @@ -1,30 +0,0 @@ -// -// Scale.metal -// paddle-mobile -// -// Created by liuRuiLong on 2019/1/4. -// Copyright © 2019 orange. All rights reserved. 
-// - -#include -using namespace metal; - -kernel void scale(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) return; - float w_stride = inTexture.get_width() / outTexture.get_width(); - float h_stride = inTexture.get_height() / outTexture.get_height(); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); - outTexture.write(input, gid); -} - -kernel void scale_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height()) return; - float w_stride = inTexture.get_width() / outTexture.get_width(); - float h_stride = inTexture.get_height() / outTexture.get_height(); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); - outTexture.write(half4(input), gid); -} diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.inc.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.inc.metal deleted file mode 100644 index 455cf1471b5c369fc27040e03b57812e8d6bf0e8..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/Softmax.inc.metal +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#ifdef P - -#define CONCAT2(a, b) a ## b -#define CONCAT2_(a, b) a ## _ ## b - -#define FUNC(f, p) CONCAT2_(f, p) -#define VECTOR(p, n) CONCAT2(p, n) - -kernel void FUNC(softmax, P)(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant SoftmaxParam &sp [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; -// int zsize = inTexture.get_array_size(); - P maxv = inTexture.read(uint2(0, gid.y), 0)[0]; - int group = sp.K / 4; - int remain = sp.K % 4; - for (int x = 0; x < group; x++) { - VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); - maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3])))); - } - if (remain > 0) { - VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); - for (int i = 0; i < remain; i++) { - maxv = max(maxv, r[i]); - } - } - VECTOR(P, 4) rsum = {0, 0, 0, 0}; - for (int x = 0; x < group; x++) { - VECTOR(P, 4) r = inTexture.read(uint2(x, gid.y), 0); - rsum += exp(r - maxv); - } - P sum = rsum[0] + rsum[1] + rsum[2] + rsum[3]; - if (remain > 0) { - VECTOR(P, 4) r = inTexture.read(uint2(group, gid.y), 0); - for (int i = 0; i < remain; i++) { - sum += exp(r[i] - maxv); - } - } - VECTOR(P, 4) rr = inTexture.read(gid.xy, gid.z); - rr = exp(rr - maxv) / sum; - outTexture.write(rr, gid.xy, gid.z); -} - -#endif diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.metal deleted file mode 100644 index 321663b9b7f09eba2041cb0932215d291e44aba6..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/metal/TransposeKernel.metal +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include -#include "Common.metal" -using namespace metal; - -struct TransposeParam { - int iC; - int oC; - int axis[4]; -}; - -kernel void transpose_copy_float(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); -} -kernel void transpose_copy_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - outTexture.write(inTexture.read(gid.xy, gid.z), gid.xy, gid.z); -} - -#define R 4 - #define P float - #include "TransposeKernel.inc.metal" - #undef P - #define P half - #include "TransposeKernel.inc.metal" - #undef P -#undef R - -#define R 3 - #define P float - #include "TransposeKernel.inc.metal" - #undef P - #define P half - #include "TransposeKernel.inc.metal" - #undef P -#undef R - -#define R 2 - #define P float - #include "TransposeKernel.inc.metal" - #undef P - #define P half - #include "TransposeKernel.inc.metal" - #undef P -#undef R diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift index 6d2e46b64986300556898596ea881a254709f472..c8dcf4e023fdebd7a817bb3b56343f3af8483fdd 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/MulticlassNMSOp.swift @@ -14,58 +14,58 @@ import Foundation -class MulticlassNMSParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope) - bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope) - output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope) - - middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim) - - bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim) - } catch let error { - throw error +class MulticlassNMSParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope) + bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope) + output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope) + + middleOutput = FetchHolder.init(inPaddedCapacity: scores.tensorDim.numel(), inDim: scores.tensorDim) + + bboxOutput = FetchHolder.init(inPaddedCapacity: bboxes.tensorDim.numel(), inDim: bboxes.tensorDim) + } catch let error { + throw error + } } - } - var bboxOutput: FetchHolder - var middleOutput: FetchHolder - let scores: Texture - let bboxes: Texture - var output: Texture + var bboxOutput: FetchHolder + var middleOutput: FetchHolder + let scores: Texture + let bboxes: Texture + var output: Texture } -class MulticlassNMSOp: Operator, MulticlassNMSParam
<P>
>, Runable, Creator, InferShaperable{ - - func inputVariant() -> [String : [MTLBuffer]] { - guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.middleOutput.resultBuffer else { - fatalError() +class MulticlassNMSOp: Operator, MulticlassNMSParam
<P>
>, Runable, Creator, InferShaperable{ + + func inputVariant() -> [String : [MTLBuffer]] { + guard let scoreBuffer = para.middleOutput.resultBuffer, let bboxBuffer = para.middleOutput.resultBuffer else { + fatalError() + } + return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]] } - return ["Scores" : [scoreBuffer], "BBoxes" : [bboxBuffer]] - } - - func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let _ { - fatalError() + + func computeMiddleResult(device: MTLDevice, buffer: MTLCommandBuffer) { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + fatalError("\(error)") + } + } + + func inferShape() { + // para.output.dim = para.input.dim + } + + typealias OpType = MulticlassNMSOp
<P>
+ func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + + } + + func delogOutput() { + print(" nms - output: ") + print(para.bboxes.metalTexture.float32Array().strideArray()) } - } - - func inferShape() { - // para.output.dim = para.input.dim - } - - typealias OpType = MulticlassNMSOp
<P>
- func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - - } - - func delogOutput() { - print(" nms - output: ") - print(para.bboxes.metalTexture.float32Array().strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift index e57c8f48e362af8cae8fedbb5a0292775f0ce923..8fed29aeddf126ebbe7986a9db4fba0a0d405738 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PoolOp.swift @@ -14,61 +14,61 @@ import Foundation -class PoolParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) - poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) - ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) - stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) - padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) - ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) - globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) - assert(input.transpose == [0, 2, 3, 1]) - } catch let error { - throw error +class PoolParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try PoolParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PoolParam.outputOut(outputs: opDesc.outputs, from: inScope) + poolType = try PoolParam.getAttr(key: "pooling_type", attrs: opDesc.attrs) + ksize = try PoolParam.getAttr(key: "ksize", attrs: opDesc.attrs) + stride = try PoolParam.getAttr(key: "strides", attrs: opDesc.attrs) + padding = try PoolParam.getAttr(key: "paddings", attrs: opDesc.attrs) + ceilMode = try PoolParam.getAttr(key: "ceil_mode", attrs: opDesc.attrs) + globalPooling = try PoolParam.getAttr(key: "global_pooling", attrs: opDesc.attrs) + assert(input.transpose == [0, 2, 3, 1]) + } catch let error { + throw error + } + // let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) } - // let buffer = input.metalTexture.buffer.contents().assumingMemoryBound(to: P.self) - } - let input: Texture - var output: Texture - var ksize: [Int32] - var stride: [Int32] - var padding: [Int32] - var poolType: String - var ceilMode: Bool - var globalPooling: Bool + let input: Texture + var output: Texture + var ksize: [Int32] + var stride: [Int32] + var padding: [Int32] + var poolType: String + var ceilMode: Bool + var globalPooling: Bool } -class PoolOp: Operator, PoolParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = PoolOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class PoolOp: Operator, PoolParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = PoolOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) - -// print("pool2d delog") -// let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) -// print(para.ksize) -// print(para.stride) -// print(para.padding) -// print(para.poolType) -// let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) - } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + + + // print("pool2d delog") + // let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true) + // print(para.ksize) + // print(para.stride) + // print(para.padding) + // print(para.poolType) + // let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift index b7150c2fea85b7a6da6ae883e95c751484db6af6..429e82a49350b42791296330b418c2126b0605e8 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PreluOp.swift @@ -14,52 +14,52 @@ import Foundation -class PreluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope) - alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) - mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs) - } catch let error { - throw error +class PreluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try PreluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try PreluParam.outputOut(outputs: opDesc.outputs, from: inScope) + alpha = try PreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope) + mode = try PreluParam.getAttr(key: "mode", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let mode: String - let alpha: Tensor
<P>
- let input: Texture - var output: Texture + let mode: String + let alpha: Tensor
<P>
+ let input: Texture + var output: Texture } -class PreluOp: Operator, PreluParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = PreluOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class PreluOp: Operator, PreluParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = PreluOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) input: ") - print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) - print(" \(type) Alpha: ") - let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) input: ") + print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray()) + + print(" \(type) Alpha: ") + let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) + + print(" \(type) output: ") + print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) + } - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray()) - } - -// print("softmax delog") -// let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) -// let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) + // print("softmax delog") + // let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false) + // let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false) } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift index bff7c9870a3dc70e820b02ad775ca8a19527c26d..6af9490766ba9d2e50ac715ac6a510e207329116 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/PriorBoxOp.swift @@ -14,110 +14,110 @@ import Foundation -class PriorBoxParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs) - } catch _ { +class PriorBoxParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + min_max_aspect_ratios_order = try PriorBoxParam.getAttr(key: "min_max_aspect_ratios_order", attrs: opDesc.attrs) + } catch _ { + } + + do { + input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope) + output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope) + inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope) + outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope) + minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs) + maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs) + aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs) + variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs) + flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs) + clip = try 
PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs) + stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs) + stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs) + offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs) + } catch let error { + throw error + } } - do { - input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope) - output = try PriorBoxParam.outputBoxes(outputs: opDesc.outputs, from: inScope) - inputImage = try PriorBoxParam.inputImage(inputs: opDesc.inputs, from: inScope) - outputVariances = try PriorBoxParam.outputVariances(outputs: opDesc.outputs, from: inScope) - minSizes = try PriorBoxParam.getAttr(key: "min_sizes", attrs: opDesc.attrs) - maxSizes = try PriorBoxParam.getAttr(key: "max_sizes", attrs: opDesc.attrs) - aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs) - variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs) - flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs) - clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs) - stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs) - stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs) - offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs) - } catch let error { - throw error - } - } - - var min_max_aspect_ratios_order: Bool = false - let minSizes: [Float32] - let maxSizes: [Float32] - let aspectRatios: [Float32] - var newAspectRatios: MTLBuffer? - let variances: [Float32] - let flip: Bool - let clip: Bool - var stepW: Float32 - var stepH: Float32 - let offset: Float32 - - let input: Texture - let inputImage: Texture - var output: Texture - let outputVariances: Texture + var min_max_aspect_ratios_order: Bool = false + let minSizes: [Float32] + let maxSizes: [Float32] + let aspectRatios: [Float32] + var newAspectRatios: MTLBuffer? + let variances: [Float32] + let flip: Bool + let clip: Bool + var stepW: Float32 + var stepH: Float32 + let offset: Float32 + + let input: Texture + let inputImage: Texture + var output: Texture + let outputVariances: Texture } -class PriorBoxOp: Operator, PriorBoxParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = PriorBoxOp
<P>
- - func inferShape() { - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } - } - - func delogOutput() { - - print(" \(type) output: ") - // output -// let outputArray = para.output.metalTexture.float32Array() -// print(outputArray.strideArray()) -// let device = para.input.metalTexture!.device -// let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3]) -// let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3]) -// print("boxes: ") -// print(boxes.strideArray()) -// print("variances: ") -// print(variances.strideArray()) - // output - print(" \(type) output: ") +class PriorBoxOp: Operator, PriorBoxParam
<P>
>, Runable, Creator, InferShaperable{ - let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])) - print(" dim: \(para.output.dim)") - print(box.strideArray()) -// print((0.. -// let padToFourDim = para.output.padToFourDim -// if para.output.transpose == [0, 1, 2, 3] { -// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision) -// print(outputArray.strideArray()) -// } else if para.output.transpose == [0, 2, 3, 1] { -// print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray()) -// } else { -// print(" not implement") -// } - -// writeToLibrary(fileName: "box_out", array: outputArray) - - // output variance -// let outputVarianceArray = para.outputVariances.metalTexture.floatArray { (o: Float32) -> Float32 in -// return o -// } -// -// print(" output variance: \(outputVarianceArray)") + func inferShape() { + } -// writeToLibrary(fileName: "variance_out", array: outputVarianceArray) + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } - } + func delogOutput() { + + print(" \(type) output: ") + // output + // let outputArray = para.output.metalTexture.float32Array() + // print(outputArray.strideArray()) + // let device = para.input.metalTexture!.device + // let boxes:[Float32] = device.texture2tensor(texture: para.output.metalTexture!, dim: para.output.tensorDim.dims, transpose: [2,0,1,3]) + // let variances:[Float32] = device.texture2tensor(texture: para.outputVariances.metalTexture!, dim: para.outputVariances.tensorDim.dims, transpose: [2,0,1,3]) + // print("boxes: ") + // print(boxes.strideArray()) + // print("variances: ") + // print(variances.strideArray()) + // output + print(" \(type) output: ") + + let box = para.output.metalTexture.realNHWC(dim: (para.output.dim[0], para.output.dim[1], para.output.dim[2], para.output.dim[3])) + print(" dim: \(para.output.dim)") + print(box.strideArray()) + // print((0.. 
Float32 in + // return o + // } + // + // print(" output variance: \(outputVarianceArray)") + + // writeToLibrary(fileName: "variance_out", array: outputVarianceArray) + + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift index ef109081061c601fb17a23e943dcd01af618b724..8a782f694b58cfe99146a8f7df2f1b8c0b5079be 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReluOp.swift @@ -15,45 +15,45 @@ import Foundation -class ReluParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error +class ReluParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ReluParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReluParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture + let input: Texture + var output: Texture } -class ReluOp: Operator, ReluParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = ReluOp
<P>
- - func inferShape() { - para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ReluOp: Operator, ReluParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = ReluOp
<P>
+ + func inferShape() { + para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - print(para.output.metalTexture) - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) -// let device = para.output.metalTexture!.device -// let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) -// print(outputArray.strideArray()) - } - + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + print(para.output.metalTexture) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + // let device = para.output.metalTexture!.device + // let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + // print(outputArray.strideArray()) + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift index e40eae02d0c11c0bd372514466b28cef27dea96b..acff1c95eaf421acd46293048c43d18456c45439 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ReshapeOp.swift @@ -15,64 +15,64 @@ import Foundation import Metal -class ReshapeParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope) - shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs) - - var s: [Int] = shape.map { Int($0) } - - var di = -1 - var ml = 1 - for i in 0..: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ReshapeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try ReshapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + shape = try ReshapeParam.getAttr(key: "shape", attrs: opDesc.attrs) + + var s: [Int] = shape.map { Int($0) } + + var di = -1 + var ml = 1 + for i in 0..= 0 { + s[di] = input.dim.numel() / ml + } + output.tensorDim = Dim.init(inDim: s) + var dim: [Int] = [1, 1, 1, 1] + for i in 0..= 0 { - s[di] = input.dim.numel() / ml - } - output.tensorDim = Dim.init(inDim: s) - var dim: [Int] = [1, 1, 1, 1] - for i in 0..: Operator, ReshapeParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = ReshapeOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ReshapeOp: Operator, ReshapeParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = ReshapeOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + func delogOutput() { + print("reshape delog") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) + // print(outputArray) } - } - func delogOutput() { - print("reshape delog") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) -// print(outputArray) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift index 980bb734a796c067012855f8a0d0c4ccef33afdb..7b9fbf412ff2216195f764f32cd757516604052d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ResizeBilinearOp.swift @@ -14,51 +14,45 @@ import Foundation -class ResizeBilinearParam: OpParam { - typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope) -// if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { -// fatalError() -// } - output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope) - out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs) - out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs) - } catch let error { - throw error +class ResizeBilinearParam: OpParam { + typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ResizeBilinearParam.inputX(inputs: opDesc.inputs, from: inScope) + // if (input.transpose != [0, 2, 3, 1]) || (input.tensorDim.cout() != 4) { + // fatalError() + // } + output = try ResizeBilinearParam.outputOut(outputs: opDesc.outputs, from: inScope) + out_h = try ResizeBilinearParam.getAttr(key: "out_h", attrs: opDesc.attrs) + out_w = try ResizeBilinearParam.getAttr(key: "out_w", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let out_h: Int32 - let out_w: Int32 + let input: Texture + var output: Texture + let out_h: Int32 + let out_w: Int32 } -class ResizeBilinearOp: Operator, ResizeBilinearParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = ResizeBilinearOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ResizeBilinearOp: Operator, ResizeBilinearParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = ResizeBilinearOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") } - } - - func delogOutput() { - print(" \(type) output: ") - } - + } - - - - - - diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift index c13c3864e4f73bdad1b83e19ca9f66051eea266d..c3a1d37f52286ee21355e308705b07a119c5a1b0 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ShapeOp.swift @@ -14,40 +14,40 @@ import Foundation -class ShapeParam: OpParam { - // typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope) - output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope) - } catch let error { - throw error +class ShapeParam: OpParam { + // typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try ShapeParam.input(inputs: opDesc.inputs, from: inScope) + output = try ShapeParam.outputOut(outputs: opDesc.outputs, from: inScope) + } catch let error { + throw error + } } - } - var output: Texture - let input: Texture + var output: Texture + let input: Texture } -class ShapeOp: Operator, ShapeParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = ShapeOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class ShapeOp: Operator, ShapeParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = ShapeOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print(" \(type) output: ") - } - + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift index 2b2455eaa60142f890c7ee5e14244c77854a0ccd..cf56f9590c2ddbfa5b5d190bb71a094239d06311 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SoftmaxOp.swift @@ -15,49 +15,49 @@ import Foundation import Metal -class SoftmaxParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) - - //assert(input.tensorDim.dims.count == 2) - //assert(input.transpose == [0, 1, 2, 3]) - - output.dim = input.dim - output.tensorDim = input.tensorDim - output.padToFourDim = input.padToFourDim - } catch let error { - throw error +class SoftmaxParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try SoftmaxParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try SoftmaxParam.outputOut(outputs: opDesc.outputs, from: inScope) + + //assert(input.tensorDim.dims.count == 2) + //assert(input.transpose == [0, 1, 2, 3]) + + output.dim = input.dim + output.tensorDim = input.tensorDim + output.padToFourDim = input.padToFourDim + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture + let input: Texture + var output: Texture } -class SoftmaxOp: Operator, SoftmaxParam
<P>
>, Runable, Creator, InferShaperable{ - typealias OpType = SoftmaxOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class SoftmaxOp: Operator, SoftmaxParam
<P>
>, Runable, Creator, InferShaperable{ + typealias OpType = SoftmaxOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim } - } - - func delogOutput() { - print("softmax delog") - print(para.input) - print(para.output) - let padToFourDim = para.output.padToFourDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) - print(outputArray.strideArray()) - } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print("softmax delog") + print(para.input) + + print(para.output) + let padToFourDim = para.output.padToFourDim + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3])) + print(outputArray.strideArray()) + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift index 4d9933f39275d522cec71ca08a591182433d7bae..d6f4aa4784dd7886b7ee9698abf7dca8b66137c0 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/SplitOp.swift @@ -14,64 +14,64 @@ import Foundation -class SplitParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope) - output = Texture.init(device: input.metalTexture!.device, inDim: input.dim) - axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs) - sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs) - if axis < 0 { - axis = input.tensorDim.cout() + axis - } - guard let outlist = opDesc.outputs["Out"] else { - fatalError() - } - for out in outlist { - guard let variant = inScope[out], let v = variant as? Texture else { - fatalError() +class SplitParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try SplitParam.inputX(inputs: opDesc.inputs, from: inScope) + output = Texture.init(device: input.metalTexture!.device, inDim: input.dim) + axis = try SplitParam.getAttr(key: "axis", attrs: opDesc.attrs) + sections = try SplitParam.getAttr(key: "sections", attrs: opDesc.attrs) + if axis < 0 { + axis = input.tensorDim.cout() + axis + } + guard let outlist = opDesc.outputs["Out"] else { + fatalError() + } + for out in outlist { + guard let variant = inScope[out], let v = variant as? Texture else { + fatalError() + } + outputList.append(v) + sections.append(Int32(v.tensorDim.dims[axis])) + } + } catch let error { + throw error } - outputList.append(v) - sections.append(Int32(v.tensorDim.dims[axis])) - } - } catch let error { - throw error } - } - - var axis: Int - let input: Texture - var output: Texture - var outputList: [Texture] = [] - var sections: [Int32] = [] + + var axis: Int + let input: Texture + var output: Texture + var outputList: [Texture] = [] + var sections: [Int32] = [] } -class SplitOp: Operator, SplitParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = SplitOp
<P>
- - func inferShape() { - // para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class SplitOp: Operator, SplitParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = SplitOp
<P>
+ + func inferShape() { + // para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.input.metalTexture!.device - for out in para.outputList { - let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose) - print(arr.strideArray()) + + func delogOutput() { + print(" \(type) output: ") + let device = para.input.metalTexture!.device + for out in para.outputList { + let arr: [Float32] = device.texture2tensor(texture: out.metalTexture, dim: out.tensorDim.dims, transpose: out.transpose) + print(arr.strideArray()) + } } - } - + } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift index 064955fcac20937ae3ac8a12f51ef52ab5a00ba9..458bbef54fb3136e8b9144e387e8c359873f93d7 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/TransposeOp.swift @@ -15,44 +15,44 @@ import Foundation import Metal -class TransposeParam: OpParam { - //typealias ParamPrecisionType = P - required init(opDesc: PMOpDesc, inScope: Scope) throws { - do { - input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope) - output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope) - axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs) - } catch let error { - throw error +class TransposeParam: OpParam { + //typealias ParamPrecisionType = P + required init(opDesc: PMOpDesc, inScope: Scope) throws { + do { + input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope) + output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope) + axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch let error { + throw error + } } - } - let input: Texture - var output: Texture - let axis: [Int32] + let input: Texture + var output: Texture + let axis: [Int32] } -class TransposeOp: Operator, TransposeParam
<P>
>, Runable, Creator, InferShaperable{ - - typealias OpType = TransposeOp
<P>
- - func inferShape() { - //para.output.dim = para.input.dim - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error +class TransposeOp: Operator, TransposeParam
<P>
>, Runable, Creator, InferShaperable{ + + typealias OpType = TransposeOp
<P>
+ + func inferShape() { + //para.output.dim = para.input.dim + } + + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + + func delogOutput() { + print(" \(type) output: ") + let device = para.output.metalTexture!.device + let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) + print(outputArray.strideArray()) } - } - - func delogOutput() { - print(" \(type) output: ") - let device = para.output.metalTexture!.device - let outputArray: [Float32] = device.texture2tensor(texture: para.output.metalTexture, dim: para.output.tensorDim.dims, transpose: para.output.transpose) - print(outputArray.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift index b021b09008b1f3bef3ba01d5a51fe7b7803fedaa..27ed620c24dcbc2f4423debe8b14c4436d0b6dda 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMBlockDesc.swift @@ -45,13 +45,13 @@ public class PMBlockDesc { } extension PMBlockDesc: CustomStringConvertible, CustomDebugStringConvertible { - public var description: String { + public var description: String { var str = "" for i in 0.. Bool) -> [String : [String]] in @@ -58,24 +58,24 @@ class PMOpDesc { } extension PMOpDesc: CustomStringConvertible, CustomDebugStringConvertible { - var description: String { - var str = "" - str += "op type: \(type): \n" - str += " op inputs: \n" - str += " \(inputs) \n" - str += " op para inputs: \n" - str += " \(paraInputs) \n" - str += " op para outputs: \n" - str += " \(outputs) \n" - str += " op attrs: \n" - str += " \(attrs) \n" + var description: String { + var str = "" + str += "op type: \(type): \n" + str += " op inputs: \n" + str += " \(inputs) \n" + str += " op para inputs: \n" + str += " \(paraInputs) \n" + str += " op para outputs: \n" + str += " \(outputs) \n" + str += " op attrs: \n" + str += " \(attrs) \n" + + return str + } + + var debugDescription: String { + return description + } + - return str - } - - var debugDescription: String { - return description - } - - } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift index 130e6f49fb61b1488538849e86ff793b53f31a86..e97f448e294c1187a12b4e6bf1139e0425de26b3 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/PMVarDesc.swift @@ -79,7 +79,7 @@ public class PMVarDesc { } extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible { - public var description: String { + public var description: String { var str = "" str += "var name \(name): \n" if let inTensorDesc = tensorDesc { @@ -93,7 +93,7 @@ extension PMVarDesc: CustomStringConvertible, CustomDebugStringConvertible { return str } - public var debugDescription: String { + public var debugDescription: String { return description } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift index dcb065de3d8c6e7ec6cf437cbc2a19305def08ae..aaf1da12382072efacfbb2dd91755b8d4f36f639 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift +++ 
b/metal/paddle-mobile/paddle-mobile/Src/Program/ProgramOptimize.swift @@ -15,286 +15,286 @@ import Foundation precedencegroup ChainNode { - associativity: left - higherThan: MultiplicationPrecedence + associativity: left + higherThan: MultiplicationPrecedence } infix operator --> : ChainNode class Node { - var inputs: [Node] = [] - var outputs: [Node] = [] - var type: String - var opDesc: PMOpDesc? - init(inOpDesc: PMOpDesc) { - type = inOpDesc.type - opDesc = inOpDesc - } - - init(inType: String) { - type = inType - } - - subscript(index: Int) -> [Node] { - var nodes: [Node] = [] - getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes) - return nodes - } - - func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) { - if index == nowIndex { - nodes.append(self) + var inputs: [Node] = [] + var outputs: [Node] = [] + var type: String + var opDesc: PMOpDesc? + init(inOpDesc: PMOpDesc) { + type = inOpDesc.type + opDesc = inOpDesc } - for output in outputs { - output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes) + init(inType: String) { + type = inType } - } - - static func -->(lNode: Node, rNode: Node) -> Node { - lNode.outputs.append(rNode) - rNode.inputs.append(lNode) - return rNode - } - - func depth(begin: UInt = 1) -> UInt { - var beginMax: UInt = 1 - for output in outputs { - let subDepth = output.depth(begin: begin + 1) - beginMax = max(begin, subDepth) - } - beginMax = max(begin, beginMax) - return beginMax - } - - func to(depth: UInt) -> Node { - let beginNode = Node.init(inType: type) - beginNode.opDesc = opDesc - to(depth: depth - 1, withNode: beginNode) - return beginNode - } - - func folderWith(fusion: Fusion.Type, removedNodes: inout [Node]) { - let fusionNode = fusion.fusionNode() - let change = fusion.change() - let inOutputs = outputs - outputs.removeAll() - opDesc?.outputs.removeAll() - for i in 0.. 
[Node] { + var nodes: [Node] = [] + getNodesWithLocation(index: index, nowIndex: 0, nodes: &nodes) + return nodes } - opDesc?.type = fusion.fusionType() - type = fusion.fusionType() - } - - private func folderWith(beginNode: Node, matchNode: Node, change: [String : [(from: String, to: String)]], removedNodes: inout [Node]) { - guard let inOpdesc = opDesc else { - fatalError() + + func getNodesWithLocation(index: Int, nowIndex: Int, nodes: inout [Node]) { + if index == nowIndex { + nodes.append(self) + } + + for output in outputs { + output.getNodesWithLocation(index: index, nowIndex: nowIndex + 1, nodes: &nodes) + } } - for attr in inOpdesc.attrs { - beginNode.opDesc?.attrs[attr.key] = attr.value - // print(beginNode.opDesc?.attrs) + static func -->(lNode: Node, rNode: Node) -> Node { + lNode.outputs.append(rNode) + rNode.inputs.append(lNode) + return rNode } - for paraInput in inOpdesc.paraInputs { - if let inChanges = change[type] { - for keyChange in inChanges { - if keyChange.from == paraInput.key { - beginNode.opDesc?.paraInputs[keyChange.to] = paraInput.value - } else { - beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value - } + func depth(begin: UInt = 1) -> UInt { + var beginMax: UInt = 1 + for output in outputs { + let subDepth = output.depth(begin: begin + 1) + beginMax = max(begin, subDepth) } - } else { - beginNode.opDesc?.paraInputs[paraInput.key] = paraInput.value - } + beginMax = max(begin, beginMax) + return beginMax } - if matchNode.outputs.count == 0 { - beginNode.outputs.append(contentsOf: outputs) - beginNode.opDesc?.outputs = inOpdesc.outputs - + func to(depth: UInt) -> Node { + let beginNode = Node.init(inType: type) + beginNode.opDesc = opDesc + to(depth: depth - 1, withNode: beginNode) + return beginNode } - removedNodes.append(self) - for i in 0.. [String : Node]{ - var map: [String : Node] = [:] - relationship(map: &map) - return map - } - - private func relationship(map: inout [String : Node]) { - guard let inOpDesc = opDesc else { - return + private func to(depth: UInt, withNode: Node) { + if depth < 1 { + return + } + + for output in outputs { + let node = Node.init(inType: output.type) + node.opDesc = output.opDesc + withNode.outputs.append(node) + output.to(depth: depth - 1, withNode: node) + } } - for output in inOpDesc.outputs { - for outputKey in output.value { - map[outputKey] = self - } + func relationship() -> [String : Node]{ + var map: [String : Node] = [:] + relationship(map: &map) + return map } - for output in outputs { - output.relationship(map: &map) + private func relationship(map: inout [String : Node]) { + guard let inOpDesc = opDesc else { + return + } + + for output in inOpDesc.outputs { + for outputKey in output.value { + map[outputKey] = self + } + } + + for output in outputs { + output.relationship(map: &map) + } } - } - + } extension Node: Equatable { - static func == (lhs: Node, rhs: Node) -> Bool { - if lhs.outputs.count != rhs.outputs.count { - return false - } - - if lhs.type != rhs.type { - return false + static func == (lhs: Node, rhs: Node) -> Bool { + if lhs.outputs.count != rhs.outputs.count { + return false + } + + if lhs.type != rhs.type { + return false + } + + for i in 0.. { - // register fusion - let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp

<P>.self,
-//                                  ConvAddAddPreluOp<P>.self,
-                                  ConvAddPreluOp<P>.self,
-                                  ConvAddOp<P>.self,
-                                  ConvBNReluOp<P>.self,
-                                  DwConvBNReluOp<P>.self,
-                                  ElementwiseAddPreluOp<P>.self
-  ]
-
-  func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc {
+class ProgramOptimize<P: PrecisionType> {
+    // register fusion
+    let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
+                                    // ConvAddAddPreluOp<P>.self,
+                                    ConvAddPreluOp<P>.self,
+                                    ConvAddOp<P>.self,
+                                    ConvBNReluOp<P>.self,
+                                    DwConvBNReluOp<P>.self,
+                                    ElementwiseAddPreluOp<P>
.self + ] - guard originProgramDesc.blocks.count == 1 else { - fatalError(" not support yet") - } - - var mapForNodeChain: [String : Node] = [:] - var nodes: [Node] = [] - var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:] - let block = originProgramDesc.blocks[0] - for opDesc in block.ops { - print(opDesc.type) - guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { - fatalError() - } - - let node = Node.init(inOpDesc: opDesc) - for inputKey in opInputKeys { - if let inputs = opDesc.inputs[inputKey] { - for input in inputs { - if let inputNode = mapForNodeChain[input] { - _ = inputNode --> node - } - } + func optimize(originProgramDesc: PMProgramDesc) -> PMProgramDesc { + + guard originProgramDesc.blocks.count == 1 else { + fatalError(" not support yet") } - } - - for outputKey in outputKeys { - if let outputs = opDesc.outputs[outputKey] { - for output in outputs { - mapForNodeChain[output] = node - } - } - } - - nodes.append(node) - - if var inNodes = typeMapNodes[opDesc.type] { - inNodes.append((node, mapForNodeChain)) - typeMapNodes[opDesc.type] = inNodes - } else { - typeMapNodes[opDesc.type] = [(node, mapForNodeChain)] - } - } - - for fusion in fusionOps { - let fusionNode = fusion.fusionNode() - let depth = fusionNode.depth() - if let toMatchNodes = typeMapNodes[fusionNode.type] { - for node in toMatchNodes { - - let toNode = node.node.to(depth: depth) - if toNode == fusionNode { // match - var canFolder = true - let relationshipMap = toNode.relationship() + + var mapForNodeChain: [String : Node] = [:] + var nodes: [Node] = [] + var typeMapNodes: [String : [(node: Node, output: [String : Node])]] = [:] + let block = originProgramDesc.blocks[0] + for opDesc in block.ops { + print(opDesc.type) + guard let opInputKeys = opInfos[opDesc.type]?.inputs, let outputKeys = opInfos[opDesc.type]?.outputs else { + fatalError() + } - for toCheck in fusion.needCheck() { - // let nodes = toCheck - let checkNodes = toNode[toCheck.0] - - for checkNode in checkNodes { - let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? [] - for inputToCheck in inputToChecks { - if node.output[inputToCheck] == nil { - if relationshipMap[inputToCheck] == nil { - canFolder = false + let node = Node.init(inOpDesc: opDesc) + for inputKey in opInputKeys { + if let inputs = opDesc.inputs[inputKey] { + for input in inputs { + if let inputNode = mapForNodeChain[input] { + _ = inputNode --> node + } } - } } - - let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? [] - for paramInputToCheck in paramInputToChecks { - if node.output[paramInputToCheck] == nil { - if relationshipMap[paramInputToCheck] == nil { - canFolder = false + } + + for outputKey in outputKeys { + if let outputs = opDesc.outputs[outputKey] { + for output in outputs { + mapForNodeChain[output] = node } - } } - } } - if !canFolder { - continue - } + nodes.append(node) - var removeNodes: [Node] = [] - node.node.folderWith(fusion: fusion, removedNodes: &removeNodes) - for removeNode in removeNodes { - nodes.remove(element: removeNode) + if var inNodes = typeMapNodes[opDesc.type] { + inNodes.append((node, mapForNodeChain)) + typeMapNodes[opDesc.type] = inNodes + } else { + typeMapNodes[opDesc.type] = [(node, mapForNodeChain)] } - } } - } - } - - var ops: [PMOpDesc] = [] - for node in nodes { - ops.append(node.opDesc!) 
+ + for fusion in fusionOps { + let fusionNode = fusion.fusionNode() + let depth = fusionNode.depth() + if let toMatchNodes = typeMapNodes[fusionNode.type] { + for node in toMatchNodes { + + let toNode = node.node.to(depth: depth) + if toNode == fusionNode { // match + var canFolder = true + let relationshipMap = toNode.relationship() + + for toCheck in fusion.needCheck() { + // let nodes = toCheck + let checkNodes = toNode[toCheck.0] + + for checkNode in checkNodes { + let inputToChecks = checkNode.opDesc?.inputs[toCheck.1] ?? [] + for inputToCheck in inputToChecks { + if node.output[inputToCheck] == nil { + if relationshipMap[inputToCheck] == nil { + canFolder = false + } + } + } + + let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? [] + for paramInputToCheck in paramInputToChecks { + if node.output[paramInputToCheck] == nil { + if relationshipMap[paramInputToCheck] == nil { + canFolder = false + } + } + } + } + } + + if !canFolder { + continue + } + + var removeNodes: [Node] = [] + node.node.folderWith(fusion: fusion, removedNodes: &removeNodes) + for removeNode in removeNodes { + nodes.remove(element: removeNode) + } + } + } + } + } + + var ops: [PMOpDesc] = [] + for node in nodes { + ops.append(node.opDesc!) + } + + let newProgramDesc = PMProgramDesc.init() + let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops) + newProgramDesc.blocks.append(newBlock) + return newProgramDesc } - - let newProgramDesc = PMProgramDesc.init() - let newBlock = PMBlockDesc.init(inVars: block.vars, inOps: ops) - newProgramDesc.blocks.append(newBlock) - return newProgramDesc - } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift index d73eefd096b32e06bff5ac82f7fb3aa16fce825e..478867b08ce80ecde1bf85913fb35de434b54f9c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/Scope.swift @@ -48,7 +48,7 @@ public class Scope { } } - + func clear(){ vars.removeAll() } diff --git a/src/common/types.cpp b/src/common/types.cpp old mode 100644 new mode 100755 index c812917e9e3d7d35b97fe83781d0300e8a3b8645..19bc6ea59522b727663144aa6c9df78ca88637a5 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -105,12 +105,16 @@ const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu"; const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand"; const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool"; const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax"; - const char *G_OP_TYPE_SLICE = "slice"; const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator"; const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals"; const char *G_OP_TYPE_PSROI_POOL = "psroi_pool"; +const char *G_OP_TYPE_ROIALIGN_POOL = "roialign_pool"; const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform"; +const char *G_OP_TYPE_PAD2D = "pad2d"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn"; +const char *G_OP_TYPE_FUSION_DECONV_BN_RELU = "fusion_deconv_bn_relu"; const char *G_OP_TYPE_PAD2D = "pad2d"; @@ -212,6 +216,10 @@ std::unordered_map< {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"}, {"RpnRois", "RpnRoiProbs"}}}, {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}, + {G_OP_TYPE_ROIALIGN_POOL, {{"X", "ROIs"}, {"Out"}}}, {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, + 
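// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// The ProgramOptimize pass above registers each fusion as a chain of op types and
// folds every matched chain in the block into a single fused op. The real Swift pass
// matches on a node graph with depth-limited traversal and also merges attributes and
// para-inputs; the linear C++ version below only demonstrates the matching idea, and
// all names in it (FusionPattern, fuse) are hypothetical.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct FusionPattern {
  std::vector<std::string> chain;  // e.g. conv2d -> elementwise_add -> relu
  std::string fused_type;          // op type that replaces the whole chain
};

std::vector<std::string> fuse(std::vector<std::string> ops,
                              const std::vector<FusionPattern> &patterns) {
  for (const auto &p : patterns) {
    for (std::size_t i = 0; i + p.chain.size() <= ops.size(); ++i) {
      if (std::equal(p.chain.begin(), p.chain.end(), ops.begin() + i)) {
        ops.erase(ops.begin() + i, ops.begin() + i + p.chain.size());
        ops.insert(ops.begin() + i, p.fused_type);
      }
    }
  }
  return ops;
}

int main() {
  const std::vector<FusionPattern> patterns = {
      {{"conv2d", "elementwise_add", "relu"}, "fusion_conv_add_relu"}};
  for (const auto &type :
       fuse({"feed", "conv2d", "elementwise_add", "relu", "fetch"}, patterns)) {
    std::cout << type << ' ';  // prints: feed fusion_conv_add_relu fetch
  }
  std::cout << '\n';
  return 0;
}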
{G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_BN_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h old mode 100644 new mode 100755 index b55b88ea688d18cf07faf07a60bf4a73d19008c7..35c1659c5a246c49f6e0ccb31c3e63ce4fdd2e71 --- a/src/common/types.h +++ b/src/common/types.h @@ -198,7 +198,12 @@ extern const char *G_OP_TYPE_SLICE; extern const char *G_OP_TYPE_ANCHOR_GENERATOR; extern const char *G_OP_TYPE_GENERATE_PROPOSALS; extern const char *G_OP_TYPE_PSROI_POOL; +extern const char *G_OP_TYPE_ROIALIGN_POOL; extern const char *G_OP_TYPE_ROI_PERSPECTIVE; +extern const char *G_OP_TYPE_PAD2D; +extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; +extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN; +extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU; extern const char *G_OP_TYPE_PAD2D; diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 5c960bbea7f8e65053998a29cd72d7b78f2fb97a..5cef0ec1a64e7e696d6b5c797e39918d6f1ee915 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/V1/api.h" +#include #include "fpga/V1/bias_scale.h" #include "fpga/V1/deconv_filter.h" #include "fpga/V1/filter.h" @@ -27,13 +28,25 @@ namespace fpga { void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = image_tensor->data(); - auto external_ptr = reinterpret_cast(image_tensor->external_data); - float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; - float *old_p = p_data; - image::format_image(&p_data, channel, height, width); - if (old_p != p_data) { - image_tensor->reset_data_ptr(p_data); + std::type_index input_type = image_tensor->type(); + if (input_type == typeid(float)) { + auto data_ptr = image_tensor->data(); + auto external_ptr = reinterpret_cast(image_tensor->external_data); + float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; + + image::format_image(&p_data, channel, height, width); + if (p_data != data_ptr && external_ptr == nullptr) { + image_tensor->reset_data_ptr(p_data); + } + } else { + auto data_ptr = image_tensor->data(); + auto external_ptr = reinterpret_cast(image_tensor->external_data); + int8_t *p_data = external_ptr == nullptr ? 
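// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// The entries added to the op-info table above declare, for each new op type, which
// input/output keys the framework expects to find in the op's VariableNameMap. A
// minimal stand-alone version of that lookup is shown below; kOpInfo and KeyLists are
// hypothetical names, the real table lives in src/common/types.cpp.
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using KeyLists = std::pair<std::vector<std::string>, std::vector<std::string>>;

static const std::unordered_map<std::string, KeyLists> kOpInfo = {
    {"roialign_pool", {{"X", "ROIs"}, {"Out"}}},
    {"fusion_deconv_add_bn_relu", {{"Input"}, {"Out"}}},
    {"fusion_deconv_add_bn", {{"Input"}, {"Out"}}},
    {"fusion_deconv_bn_relu", {{"Input"}, {"Out"}}},
};

int main() {
  const auto it = kOpInfo.find("roialign_pool");
  if (it == kOpInfo.end()) return 1;
  for (const auto &key : it->second.first) std::cout << "input key:  " << key << '\n';
  for (const auto &key : it->second.second) std::cout << "output key: " << key << '\n';
  return 0;
}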
data_ptr : external_ptr; + + image::format_image(&p_data, channel, height, width); + if (p_data != data_ptr && external_ptr == nullptr) { + image_tensor->reset_data_ptr(p_data); + } } } @@ -48,9 +61,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; if (dims.size() == 4) { - auto channel = dims[1], height = dims[2], width = dims[3]; - memory_size = - height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0]; + memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * + sizeof(half); } else if (dims.size() == 2) { memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); } else { @@ -60,6 +73,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) { memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); ofm_tensor->set_type(typeid(half)); + ofm_tensor->fpga_data_num = memory_size / sizeof(half); } void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { @@ -78,7 +92,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); ofm_tensor->set_type(typeid(half)); + ofm_tensor->fpga_data_num = memory_size / sizeof(half); } + void format_fp32_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; @@ -95,6 +111,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) { memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); ofm_tensor->set_type(typeid(float)); + ofm_tensor->fpga_data_num = memory_size / sizeof(float); } float filter_find_max(framework::Tensor *filter_tensor) { @@ -162,7 +179,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { fpga_copy(new_data, data_ptr, memory_size); filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(typeid(int8_t)); + filter_tensor->set_type(typeid(int16_t)); } void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, @@ -364,9 +381,10 @@ void expand_conv_arg(ConvArgs *arg) { auto filter_pad_width_mul_channel = args.image.pad_width * args.image.channels; auto image_amount_per_row_multi_win_first = - image_amount_per_row * (2 * args.kernel.stride_h - args.image.pad_height); + image_amount_per_row * + (ROW_PARALLEL_NUM * args.kernel.stride_h - args.image.pad_height); auto image_amount_per_row_multi_win = - image_amount_per_row * (2 * args.kernel.stride_h); + image_amount_per_row * (ROW_PARALLEL_NUM * args.kernel.stride_h); auto image_block_num = block_num; auto image_block_len = @@ -396,8 +414,8 @@ void expand_conv_arg(ConvArgs *arg) { // auto cmd = 0UL | (args.relu_enabled ? 
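// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// format_fp16_ofm above now sizes the output feature map as
// num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half), i.e. the
// batch dimension is included and every row is padded to the FPGA row alignment. The
// stand-alone computation below mirrors that formula; the alignment value 16 is an
// assumption made only for this example (IMAGE_ALIGNMENT is defined elsewhere).
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr std::size_t kImageAlignment = 16;  // assumed value of IMAGE_ALIGNMENT

inline std::size_t align_to_x(std::size_t n, std::size_t x) {
  return (n + x - 1) / x * x;
}

int main() {
  const std::size_t num = 2, channel = 3, height = 224, width = 224;
  // uint16_t stands in for the 2-byte half type used by the FPGA path.
  const std::size_t bytes =
      num * height * align_to_x(channel * width, kImageAlignment) * sizeof(uint16_t);
  std::cout << "fp16 ofm bytes: " << bytes << '\n';
  // fpga_data_num recorded on the tensor is the element count, i.e. bytes / sizeof(half).
  std::cout << "fpga_data_num:  " << bytes / sizeof(uint16_t) << '\n';
  return 0;
}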
USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | USE_BIAS; - auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) | - ((args.deconv_tx_param.sub_conv_num) << 16) | + auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | + ((args.deconv_tx_param.sub_conv_num) << 8) | ((args.deconv_tx_param.omit_size) << 0); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); @@ -623,7 +641,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga::format_fp16_ofm(out, dims_out_new); auto out_ptr = out->data(); arg->output.address = - out_ptr + + (half *)out_ptr + // NOLINT omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -758,9 +776,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = - &filter_ptr[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; + auto filter_head = &(( + int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; arg->split_conv_args[i]->conv_arg[j].filter_address = fpga_malloc(filter_size); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -839,7 +857,7 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, arg->vector_dwconv_space.push_back( std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); - auto filter_ptr = filter->data(); + auto filter_ptr = filter->data(); auto input_ptr = input->data(); auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index ebba4f3eaf7ff822bae240f8565b4b5f86f1a796..4ba5af83ab26a8b21ea868c8a28bb94da5216c69 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -13,60 +13,48 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "fpga/V1/image.h" -#include -#include -#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { namespace image { -void convert_to_hwc(float **data_in, int channel, int height, int width) { - float *data_tmp = - (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT +void convert_to_hwc(float **data_in, int channel, int height, int width, + int num) { + float *data_tmp = reinterpret_cast( + fpga_malloc(num * channel * height * width * sizeof(float))); int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * channel * height * width + offset_height + + w * channel + c) = *((*data_in)++); + } } } } *data_in = data_tmp; } -void align_element_conv(float **data_in, int height, int cw) { - int h = 0; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - +void convert_to_chw(float **data_in, int channel, int height, int width, + int num) { float *data_tmp = - (float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT - - memset(data_tmp, 0, height * align_cw * sizeof(float)); - - for (h = 0; h < height; h++) { - memcpy((void *)(data_tmp + h * align_cw), // NOLINT - (void *)(*data_in + h * cw), // NOLINT - cw * sizeof(float)); + (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * height * width * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } } - *data_in = data_tmp; } -void format_image(float **data_in, int channel, int height, int width) { - convert_to_hwc(data_in, channel, height, width); - int cw = channel * width; - int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - if (align_cw != cw) { - float *hwc_temp = *data_in; - align_element_conv(data_in, height, channel * width); - fpga_free(hwc_temp); - } - fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height * - sizeof(float)); -} - void concat_images(int16_t **images_in, float **scales_in, void *image_out, float *scale_out, int image_num, uint32_t *channel_num, int height, int width) { @@ -132,8 +120,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out, for (int i = 0; i < image_num; i++) { des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + w * channel_nums[i]; - memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, - channel_nums[i] * sizeof(int16_t)); + memcpy(reinterpret_cast(images_out[i]) + des_offset, + image_in + src_offset, channel_nums[i] * sizeof(int16_t)); src_offset += channel_nums[i]; } } diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index f3c7b2731cb555c0c8871f6cd1d9f9df3e6429f2..f5dc6ffe3e1d9747bf4c9cfd86f5a951e7b0ac24 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -14,16 +14,54 @@ limitations under the License. 
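// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// convert_to_hwc/convert_to_chw above walk the source buffer linearly and scatter the
// values into the destination layout, now for every image in the batch. The tiny
// stand-alone example below applies the same CHW -> HWC index mapping to a 1x2x2x3
// tensor so the reordering is easy to verify by hand.
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const int num = 1, channel = 2, height = 2, width = 3;
  std::vector<float> chw(num * channel * height * width);
  for (std::size_t i = 0; i < chw.size(); ++i) chw[i] = static_cast<float>(i);

  std::vector<float> hwc(chw.size());
  const float *src = chw.data();  // read in N,C,H,W order, exactly like the patch
  for (int n = 0; n < num; ++n)
    for (int c = 0; c < channel; ++c)
      for (int h = 0; h < height; ++h)
        for (int w = 0; w < width; ++w)
          hwc[((n * height + h) * width + w) * channel + c] = *src++;

  for (float v : hwc) std::cout << v << ' ';
  std::cout << '\n';  // prints: 0 6 1 7 2 8 3 9 4 10 5 11
  return 0;
}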
*/ #pragma once +#include +#include #include - +#include "fpga/common/fpga_common.h" namespace paddle_mobile { namespace fpga { namespace image { -void convert_to_hwc(float** data_in, int channel, int height, int width); -void align_element_conv(float** data_in, int height, int cw); -void format_image(float** data_in, int channel, int height, int width); +void convert_to_hwc(float** data_in, int channel, int height, int width, + int num = 1); +void convert_to_chw(float** data_in, int channel, int height, int width, + int num = 1); +// template +// void align_element_conv(Dtype** data_in, int height, int cw); +// template +// void format_image(T** data_in, int channel, int height, int width); +template +void align_element_conv(Dtype** data_in, int height, int cw); +template +void align_element_conv(Dtype** data_in, int height, int cw) { + int h = 0; + int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); + + Dtype* data_tmp = + (Dtype*)fpga_malloc(height * align_cw * sizeof(Dtype)); // NOLINT + + memset(data_tmp, 0, height * align_cw * sizeof(Dtype)); + + for (h = 0; h < height; h++) { + memcpy((void*)(data_tmp + h * align_cw), // NOLINT + (void*)(*data_in + h * cw), // NOLINT + cw * sizeof(Dtype)); + } + *data_in = data_tmp; +} +template +void format_image(T** data_in, int channel, int height, int width) { + int cw = channel * width; + int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); + if (align_cw != cw) { + T* hwc_temp = *data_in; + align_element_conv(data_in, height, channel * width); + fpga_free(hwc_temp); + } + fpga_flush(*data_in, + align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T)); +} // Concat featuremaps along channel direction void concat_images(int16_t** images_in, float** scales_in, void* image_out, float* scale_out, int image_num, uint32_t* channel_num, diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index 37feeb9dfa1a0e9a8c4dc9f789c0ab673e0f4d65..24ef95e6fc25b32a2faf69c7e685b5c1f07d1cdd 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -38,10 +38,12 @@ using namespace std; // NOLINT #define CMD_FP16_TO_FP32 1 #define CMD_FP32_TO_FP16 2 #define CMD_FP32_TO_FP32 3 +#define CMD_INT8_TO_FP16 4 // bypass macro #define SIZE_FP16 2 #define SIZE_FP32 4 +#define SIZE_INT8 1 #define PE_IRQ_TIMEOUT 1000000 @@ -607,6 +609,16 @@ int PerformBypass(const struct BypassArgs &args) { } } break; + case DATA_TYPE_INT8: { + if (args.output_data_type != DATA_TYPE_FP16) { + DLOG << "error:Output Datetype error,not DATA_TYPE_FP16: " + << args.output_data_type; + } + data_cell_in = SIZE_INT8; + data_cell_out = SIZE_FP16; + cmd = CMD_INT8_TO_FP16; + } break; + case DATA_TYPE_FP32: { switch (args.output_data_type) { case DATA_TYPE_FP16: @@ -630,10 +642,13 @@ int PerformBypass(const struct BypassArgs &args) { break; } if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 && - cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) { + cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32 && + cmd != CMD_INT8_TO_FP16) { + // std::cout<< " err back Error1!" 
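// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// The templated align_element_conv above pads every row of channel*width elements out
// to the FPGA row alignment so that fp32 and int8 images share one code path. The
// stand-alone helper below shows the same padding with std::vector instead of
// fpga_malloc; the alignment value 16 is an assumption made only for this example.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

constexpr int kImageAlignment = 16;  // assumed value of IMAGE_ALIGNMENT

inline int align_to_x(int n, int x) { return (n + x - 1) / x * x; }

template <typename Dtype>
std::vector<Dtype> align_rows(const std::vector<Dtype> &in, int height, int cw) {
  const int align_cw = align_to_x(cw, kImageAlignment);
  std::vector<Dtype> out(static_cast<std::size_t>(height) * align_cw, Dtype(0));
  for (int h = 0; h < height; ++h) {
    // Copy the cw valid elements of each row; the padding tail stays zero.
    std::memcpy(out.data() + static_cast<std::size_t>(h) * align_cw,
                in.data() + static_cast<std::size_t>(h) * cw, cw * sizeof(Dtype));
  }
  return out;
}

int main() {
  const int height = 2, cw = 6;  // channel * width = 6 is padded to 16 per row
  const std::vector<int8_t> rows(static_cast<std::size_t>(height) * cw, 1);
  const auto aligned = align_rows(rows, height, cw);
  std::cout << "row stride: " << cw << " -> " << aligned.size() / height << '\n';
  return 0;
}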
<= 0) { - *p |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_LONG; - mask_to_set = ~0UL; - p++; - } - if (len) { - mask_to_set &= BITMAP_LAST_WORD_MASK(size); - *p |= mask_to_set; - } -} - -void bitmap_clear(uint64_t *map, unsigned int start, int len) { - uint64_t *p = map + BIT_WORD(start); - const unsigned int size = start + len; - int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); - uint64_t mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - - while (len - bits_to_clear >= 0) { - *p &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_LONG; - mask_to_clear = ~0UL; - p++; - } - if (len) { - mask_to_clear &= BITMAP_LAST_WORD_MASK(size); - *p &= ~mask_to_clear; - } -} - -static uint64_t ffs(uint64_t data) { - uint64_t bit = 0; - int i = 0; - - for (i = 0; i < sizeof(data) * 8; i++) { - if (data & (1UL << i)) { - bit = i; - break; - } - } - - return bit; -} - -static uint64_t _find_next_bit(const uint64_t *addr, uint64_t nbits, - uint64_t start, uint64_t invert) { - uint64_t tmp = 0; - - if (!nbits || start >= nbits) return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - - /* Handle 1st word. */ - tmp &= BITMAP_FIRST_WORD_MASK(start); - start = round_down(start, BITS_PER_LONG); - - while (!tmp) { - start += BITS_PER_LONG; - if (start >= nbits) return nbits; - - tmp = addr[start / BITS_PER_LONG] ^ invert; - } - - return (start + ffs(tmp)) < nbits ? (start + ffs(tmp)) : nbits; -} - -uint64_t find_next_zero_bit(const uint64_t *addr, uint64_t size, - uint64_t offset) { - return _find_next_bit(addr, size, offset, ~0UL); -} - -uint64_t find_next_bit(const uint64_t *addr, uint64_t size, uint64_t offset) { - return _find_next_bit(addr, size, offset, 0UL); -} - -uint64_t bitmap_find_next_zero_area_off(uint64_t *map, uint64_t size, - uint64_t start, unsigned int nr, - uint64_t align_mask, - uint64_t align_offset) { - uint64_t index = 0; - uint64_t end = 0; - uint64_t i = 0; - -again: - index = find_next_zero_bit(map, size, start); - - /* Align allocation */ - index = __ALIGN_MASK(index + align_offset, align_mask) - align_offset; - - end = index + nr; - if (end > size) return end; - i = find_next_bit(map, end, index); - if (i < end) { - start = i + 1; - goto again; - } - - return index; -} - -uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, - uint64_t start, unsigned int nr, - uint64_t align_mask) { - return bitmap_find_next_zero_area_off(map, size, start, nr, align_mask, 0); -} -} // namespace fpga_bitmap diff --git a/src/fpga/common/bitmap.h b/src/fpga/common/bitmap.h deleted file mode 100644 index 4cb1673d91d61c1ec27bbc6923e49e8dd04e3a37..0000000000000000000000000000000000000000 --- a/src/fpga/common/bitmap.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#define BITS_PER_LONG 64 -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) -#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1))) -#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1))) - -#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) -#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) - -#define round_down(x, y) ((x) & ~((y)-1)) - -namespace fpga_bitmap { -void bitmap_set(uint64_t *map, unsigned int start, int len); -void bitmap_clear(uint64_t *map, unsigned int start, int len); -uint64_t bitmap_find_next_zero_area(uint64_t *map, uint64_t size, - uint64_t start, unsigned int nr, - uint64_t align_mask); - -} // namespace fpga_bitmap diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index b1d3559dbbb238ae24cc6224e2d253dab744dce1..0774cab71e99ce28987e922e22d46ab9a63b1a93 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -26,9 +26,9 @@ limitations under the License. */ #include #include #include +#include #include "common/enforce.h" -#include "fpga/common/bitmap.h" #include "fpga/common/driver.h" namespace paddle_mobile { @@ -148,34 +148,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { } } -/*内存管理*/ -int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { - uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); - unsigned int nr = (unsigned int)_nr; - int ret = 0; - uint64_t a_size = FPGA_PAGE_SIZE * nr; - - pthread_mutex_lock(&memory->mutex); - - unsigned int pos = (unsigned int)fpga_bitmap::bitmap_find_next_zero_area( - memory->bitmap, memory->page_num, 0, nr, 0); - if (pos <= memory->page_num) { - uint64_t address_ofset = - memory->mem_start + ((uint64_t)pos) * FPGA_PAGE_SIZE; - fpga_bitmap::bitmap_set(memory->bitmap, pos, nr); - memory->nr[pos] = nr; - - *addr = address_ofset; - } else { - DLOG << "memory request failed!"; - ret = -ENOMEM; - } - - pthread_mutex_unlock(&memory->mutex); - - return ret; -} - void memory_release(struct fpga_memory *memory) { void *ptr = nullptr; @@ -187,97 +159,6 @@ void memory_release(struct fpga_memory *memory) { } } -int create_fpga_memory_inner(struct fpga_memory *memory, size_t memory_size) { - int rc = 0; - - uint64_t *bitmap = nullptr; - unsigned int *nr = nullptr; - - // 不允许多份memory创建,所以创建memory结构体不存在互斥 - // pthread_mutex_lock(&memory->mutex); - memory->page_num = (unsigned int)(memory_size / FPGA_PAGE_SIZE); - memory->page_num_long = DIV_ROUND_UP(memory->page_num, BITS_PER_LONG); - - bitmap = - (uint64_t *)malloc(sizeof(int64_t) * memory->page_num_long); // NOLINT - if (!bitmap) { - rc = -EFAULT; - return rc; - } - memory->bitmap = bitmap; - - nr = (unsigned int *)calloc(memory->page_num, sizeof(unsigned int)); - if (!nr) { - rc = -EFAULT; - free(bitmap); - return rc; - } - memory->nr = nr; - - memory->mem_start = FPGA_MEM_PHY_ADDR; - memory->mem_end = FPGA_MEM_SIZE; - // pthread_mutex_unlock(memory->mutex); - - return rc; -} - -int create_fpga_memory(struct fpga_memory **memory_info) { - int rc = 0; - - *memory_info = (struct fpga_memory *)malloc(sizeof(struct fpga_memory)); - if (*memory_info == NULL) { - rc = -EFAULT; - return rc; - } - pthread_mutex_init(&((*memory_info)->mutex), nullptr); - - rc = create_fpga_memory_inner(*memory_info, FPGA_MEM_SIZE); - if (rc) { - free(*memory_info); - } - - return rc; -} - -int init_fpga_memory(struct fpga_memory *memory) { - int rc = 0; - - if (!memory) { - rc = -EFAULT; - return rc; - } - - 
fpga_bitmap::bitmap_clear(memory->bitmap, 0, memory->page_num); - fpga_bitmap::bitmap_set(memory->bitmap, 0, 1); // NOTE reserve fpga page 0. - - return 0; -} - -void destroy_fpga_memory(struct fpga_memory *memory) { - if (memory) { - free(memory->nr); - free(memory->bitmap); - free(memory); - } -} - -int fpga_memory_add() { - int rc = 0; - - rc = create_fpga_memory(&g_fpgainfo.memory_info); - if (rc) { - return rc; - } - - rc = init_fpga_memory(g_fpgainfo.memory_info); - if (rc) { - destroy_fpga_memory(g_fpgainfo.memory_info); - return rc; - } - - return 0; -} - uint64_t vaddr_to_paddr_driver(void *address) { uint64_t paddr = 0; auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); @@ -314,17 +195,28 @@ void *fpga_reg_free(void *ptr) { } } +static inline int do_ioctl(int64_t req, const void *arg) { + return ioctl(g_fpgainfo.fd_mem, req, arg); +} + void *fpga_malloc_driver(size_t size) { void *ret = nullptr; uint64_t phy_addr = 0; int i = 0; + struct MemoryVM2PHYArgs args; + struct MemoryCacheArgs args_c; - memory_request(g_fpgainfo.memory_info, size, &phy_addr); + // memory_request(g_fpgainfo.memory_info, size, &phy_addr); ret = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, - g_fpgainfo.fd_mem, phy_addr); + g_fpgainfo.fd_mem, FPGA_MEM_PHY_ADDR); PADDLE_MOBILE_ENFORCE(ret != (void *)-1, "Should not be -1"); + args.pVM = reinterpret_cast(ret); + args.pPHY = reinterpret_cast(0); + do_ioctl(IOCTL_MEMORY_VM2PHY, &args); + phy_addr = (uint64_t)args.pPHY; + g_fpgainfo.fpga_vaddr2paddr_map.insert(std::make_pair(ret, phy_addr)); g_fpgainfo.fpga_addr2size_map.insert(std::make_pair(ret, size)); @@ -342,14 +234,8 @@ void fpga_free_driver(void *ptr) { g_fpgainfo.fpga_addr2size_map.erase(iter); munmap(ptr, size); - p_addr = vaddr_to_paddr_driver(ptr); - pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; - - /*clear bitmap*/ - pthread_mutex_lock(&g_fpgainfo.memory_info->mutex); - fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, - g_fpgainfo.memory_info->nr[pos]); - pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); + // p_addr = vaddr_to_paddr_driver(ptr); + // pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr); if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { @@ -360,10 +246,6 @@ void fpga_free_driver(void *ptr) { } } -static inline int do_ioctl(int64_t req, const void *arg) { - return ioctl(g_fpgainfo.fd_mem, req, arg); -} - int fpga_flush_driver(void *address, size_t size) { struct MemoryCacheArgs args; uint64_t p_addr; @@ -413,7 +295,7 @@ int open_device_driver() { g_fpgainfo.FpgaRegVirAddr = (uint64_t *)fpga_reg_malloc(FPGA_REG_SIZE); // NOLINT - fpga_memory_add(); + // fpga_memory_add(); pl_init(); @@ -424,7 +306,6 @@ int close_device_driver() { pl_destroy(); fpga_reg_free(g_fpgainfo.FpgaRegVirAddr); memory_release(g_fpgainfo.memory_info); - destroy_fpga_memory(g_fpgainfo.memory_info); return 0; } diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index d35627cd46b3f233255a98d1e1fbca27469f715c..87c68cbb5a1abe935b97ed9783785be65030ffff 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -31,8 +31,8 @@ namespace driver { #define FPGA_REG_PHY_ADDR 0x80000000 #define FPGA_REG_SIZE 0x1000 -#define FPGA_MEM_PHY_ADDR 0x40000000 -#define FPGA_MEM_SIZE 0x80000000 +#define FPGA_MEM_PHY_ADDR 0x20000000 +#define FPGA_MEM_SIZE 0x20000000 #define FPGA_PAGE_SIZE (16UL * 1024UL) @@ -52,9 +52,15 @@ struct MemoryCacheArgs { size_t size; }; +struct 
MemoryVM2PHYArgs { + void *pVM; + void *pPHY; +}; + #define IOCTL_FPGA_MAGIC 'F' #define IOCTL_MEMCACHE_INVAL _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryCacheArgs) #define IOCTL_MEMCACHE_FLUSH _IOW(IOCTL_FPGA_MAGIC, 13, struct MemoryCacheArgs) +#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs) struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index 898e76a65425c357a00e76eaedf39c003c9603f3..fc3ca28cc95df503756d722142abf70ad5108308 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -25,11 +25,13 @@ limitations under the License. */ #define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 #define BS_NUM_ALIGNMENT (8) #define BIAS_NUM_ALIGNMENT (16) +#define ROW_PARALLEL_NUM (3) #endif namespace paddle_mobile { namespace fpga { enum DataType { + DATA_TYPE_INT8 = 2, DATA_TYPE_FP32 = 1, DATA_TYPE_FP16 = 0, }; diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 5d95df063b50d86165dc73d5da31dd17827c09d7..0716ee7f2e5d6c6bc7065c4ce8087ce95db4573b 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -524,12 +524,25 @@ void Executor::FeedData(const std::vector &v) { PADDLE_MOBILE_ENFORCE(input_size == vars.size(), "input data number not correct"); for (int i = 0; i < input_size; i++) { - auto var = program_.scope->Var("feed", i); + auto var = vars[i]; auto feed_tensor = var->template GetMutable(); feed_tensor->external_data = v[i]; } } +template +void Executor::FeedTensorData(const vector &v) { + auto input_size = v.size(); + auto vars = program_.scope->VarContain("feed"); + PADDLE_MOBILE_ENFORCE(input_size == vars.size(), + "input data number not correct"); + for (int i = 0; i < input_size; i++) { + auto var = vars[i]; + auto feed_tensor = var->template GetMutable(); + feed_tensor->ShareDataWith(v[i]); + } +} + template void Executor::GetResults(std::vector *v) { auto output_size = v->size(); @@ -537,13 +550,34 @@ void Executor::GetResults(std::vector *v) { auto vars = program_.scope->VarContain("fetch"); PADDLE_MOBILE_ENFORCE(output_size == vars.size(), "output data number not correct"); + for (int i = 0; i < output_size; i++) { - auto var = program_.scope->Var("fetch", i); + auto var = vars[i]; auto fetch_tensor = var->template GetMutable(); (*v)[i] = fetch_tensor->template data(); } } +template +void Executor::GetTensorResults( + std::vector *v) { + auto vars = program_.scope->VarContain("fetch"); + auto output_size = vars.size(); + + for (int i = 0; i < output_size; i++) { + auto var = vars[i]; + auto fetch_tensor = var->template GetMutable(); + v->push_back(fetch_tensor); + } +} + +template +framework::Tensor *Executor::GetTensorByName( + const std::string &name) { + auto var = program_.scope->Var(name); + return var->template GetMutable(); +} + template std::shared_ptr Executor::FetchResult(int id) { auto &ops = ops_of_block0_; diff --git a/src/framework/executor.h b/src/framework/executor.h index a706af54f9ab3c7b165993d4ffe9e627ed68a6a3..ea7bde7f748352c9b1221e69f3359938b7371a39 100644 --- a/src/framework/executor.h +++ b/src/framework/executor.h @@ -53,7 +53,12 @@ class Executor { void InjectVariable(const Tensor &t, std::string var_name); void FeedData(const Tensor &t); void FeedData(const std::vector &v); + void FeedTensorData(const std::vector &v); + void GetResults(std::vector *v); + void GetTensorResults(std::vector *v); + framework::Tensor *GetTensorByName(const std::string &name); + 
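// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// With the user-space bitmap allocator removed, fpga_malloc_driver above maps the
// reserved FPGA region through the memory fd and then asks the kernel driver for the
// physical address via IOCTL_MEMORY_VM2PHY. The stand-alone function below follows the
// same sequence; it uses plain mmap for portability (the patch uses mmap64), and the
// fd/base-address parameters are placeholders supplied by the caller.
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <cstddef>
#include <cstdint>

struct MemoryVM2PHYArgs {
  void *pVM;   // in:  virtual address returned by mmap
  void *pPHY;  // out: physical address filled in by the driver
};

#define IOCTL_FPGA_MAGIC 'F'
#define IOCTL_MEMORY_VM2PHY _IOWR(IOCTL_FPGA_MAGIC, 15, struct MemoryVM2PHYArgs)

void *fpga_malloc_example(int fd_mem, std::size_t size, off_t phy_base,
                          uint64_t *paddr) {
  void *vaddr =
      mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd_mem, phy_base);
  if (vaddr == MAP_FAILED) return nullptr;

  MemoryVM2PHYArgs args;
  args.pVM = vaddr;
  args.pPHY = nullptr;
  if (ioctl(fd_mem, IOCTL_MEMORY_VM2PHY, &args) != 0) {  // driver resolves VM -> PHY
    munmap(vaddr, size);
    return nullptr;
  }
  *paddr = reinterpret_cast<uint64_t>(args.pPHY);
  return vaddr;
}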
std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/framework/loader.cpp b/src/framework/loader.cpp index 6f03dae0c7378d506e6ba7b3fc42c93e3f24a319..99674307aae2b105ca1e125dbbb959f0f5301c6d 100644 --- a/src/framework/loader.cpp +++ b/src/framework/loader.cpp @@ -241,6 +241,7 @@ const Program Loader::LoadProgram( FusionAndPrintInfos(optimize, can_add_split, &program, originProgramDesc); paddle_mobile__framework__proto__program_desc__free_unpacked(c_program, NULL); + free(buf); return program; } diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 12fc3d7f1439d160e19db5773cead7bff5b4f155..74398bbc5b368236d56e5180452b5b05d7d156ad 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/operator.h" +#include #include "operators/op_param.h" - namespace paddle_mobile { namespace framework { diff --git a/src/framework/operator.h b/src/framework/operator.h index 9b8226c5efb27553d56960762c8400a2d10e6b71..d58168017f9c82f07bbe31129ddb70d10c933939 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -80,6 +81,7 @@ class OperatorBase { } #ifdef PADDLE_MOBILE_FPGA void InsertTensors(); + void ChangeNameMap(string key, std::vector value); #endif protected: @@ -96,15 +98,35 @@ class OperatorBase { template class OperatorWithKernel : public OperatorBase { public: +#ifndef PADDLE_MOBILE_FPGA1 OperatorWithKernel(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, framework::Scope *scope) : OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) { + param_(inputs, outputs, attrs, scope.get()) { #ifdef PADDLE_MOBILE_CL kernel_.InitCLHelper(scope->GetCLScpoe()); #endif } +#else + OperatorWithKernel(const std::string &type, const VariableNameMap inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorBase(type, inputs, outputs, attrs, scope) { + static int feed_num = 0; + static int fetch_num = 0; + if (type == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope->Var(new_name); + (const_cast(inputs)).at("X") = {string(new_name)}; + } else if (type == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope->Var(new_name); + (const_cast(outputs)).at("Out") = {string(new_name)}; + } + param_ = ParamType(inputs, outputs, attrs, *scope); + } +#endif virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index 5ddb71aaf700b96b0630c1d0a4a8779f3ac1ddcb..db263081446f9804e5352588063a23f72a8bf163 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -126,6 +126,8 @@ std::vector Scope::VarContain(const std::string substring) { return v; } +void Scope::InsertVar(const std::string str, Variable *var) {} + void Scope::print_vars() { DLOG << "====================start to print variables================="; for (auto pair : vars_) { diff --git a/src/framework/scope.h b/src/framework/scope.h index 08eebf8935abb52d01179837a0c76f24fae3f36d..c52917499639c9ed03c8807c726cdf1dcdaece9e 100644 --- 
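// --- Illustrative sketch (editorial example, not part of the patch) ---------------
// FeedTensorData / GetTensorResults / GetTensorByName declared above give FPGA-side
// callers a tensor-level feed/fetch path. A minimal usage pattern is sketched below;
// the Engine template parameter stands for whatever PaddleMobile instantiation the
// build uses, and run_once / "some_var_name" are hypothetical names.
#include <string>
#include <vector>
#include "framework/tensor.h"

template <typename Engine>
void run_once(Engine &engine,
              const std::vector<paddle_mobile::framework::Tensor> &inputs) {
  // Bind one caller-owned tensor to every "feed" variable (shared, not copied).
  engine.FeedTensorData(inputs);

  // Execute all ops of block 0.
  engine.Predict_From_To(0, -1);

  // Collect pointers to every "fetch" tensor, in the order they appear in the program.
  std::vector<paddle_mobile::framework::Tensor *> outputs;
  engine.GetTensorResults(&outputs);

  // Or read a single variable directly by name.
  paddle_mobile::framework::Tensor *named = engine.GetTensorByName("some_var_name");
  (void)named;
  (void)outputs;
}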
a/src/framework/scope.h +++ b/src/framework/scope.h @@ -78,6 +78,7 @@ class Scope { #ifdef PADDLE_MOBILE_FPGA Variable *Var(const std::string &name, const int id); std::vector VarContain(const std::string substring); + void InsertVar(const std::string str, Variable *var); void print_vars(); #endif diff --git a/src/framework/tensor.h b/src/framework/tensor.h index 16656c08b866aa4db08481bc4ac91f6b5e86a728..24f09662ea5ecca2a96ccdac7e863034f6a3a311 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -31,6 +31,11 @@ limitations under the License. */ namespace paddle_mobile { namespace framework { +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, +}; + class LoDTensor; class Tensor : public TensorBase { @@ -224,6 +229,8 @@ class Tensor : public TensorBase { float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX void *external_data = nullptr; // only used for Feed + LayoutType layout = LAYOUT_HWC; + int64_t fpga_data_num; #endif }; diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 7c391c0bf84c34f0ea884a171e5a014711150d77..638a369b4ef7153772dbe5ca1770722c5212edaa 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "io/api_paddle_mobile.h" +#include #include #include "common/enforce.h" #include "framework/tensor.h" @@ -41,10 +42,12 @@ bool PaddleMobilePredictor::Init(const PaddleMobileConfig &config) { config.memory_pack.combined_params_buf); } else if (!config.model_dir.empty()) { paddle_mobile_->Load(config.model_dir, config.optimize, - config.quantification, config.batch_size); + config.quantification, config.batch_size, + config.lod_mode); } else if (!config.prog_file.empty() && !config.param_file.empty()) { paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize, - config.quantification, config.batch_size); + config.quantification, config.batch_size, + config.lod_mode); } else { LOG(kLOG_ERROR) << "fail to load inference model!"; return false; @@ -111,72 +114,64 @@ bool PaddleMobilePredictor::Run( } #ifdef PADDLE_MOBILE_FPGA -template -bool PaddleMobilePredictor::Run( - const std::vector &inputs, - std::vector *output_data, std::vector *index_data, - int batch_size) { - if (inputs.empty()) { - LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; - return false; - } - auto input = inputs[0]; - - if (input.shape.size() != 4) { - LOG(kLOG_ERROR) << "input shape not equal to 4!"; - return false; - } - std::vector dims; - for (auto d : input.shape) { - dims.push_back(static_cast(d)); - } +void ConvertPaddleTensors(const PaddleTensor &src, framework::Tensor *des) { + des->Resize(framework::make_ddim(src.shape)); + des->external_data = src.data.data(); + des->set_type(src.dtypeid); + des->layout = + src.layout == LAYOUT_HWC ? framework::LAYOUT_HWC : framework::LAYOUT_CHW; +} - // use tensor - framework::DDim ddim = - framework::make_ddim({dims[0], dims[1], dims[2], dims[3]}); +void ConvertTensors(const framework::Tensor &src, PaddleTensor *des) { + des->shape = framework::vectorize2int(src.dims()); + des->dtypeid = src.type(); + des->layout = src.layout == framework::LAYOUT_HWC ? 
LAYOUT_HWC : LAYOUT_CHW; - framework::Tensor input_tensor; - input_tensor.Resize(ddim); - int input_length = framework::product(ddim); - auto input_ptr = input_tensor.mutable_data(); - - memcpy(input_ptr, static_cast(input.data.data()), - input_length * sizeof(T)); - paddle_mobile_->Predict(input_tensor); - auto num_result = index_data->size(); - if (output_data->size() != num_result) { - LOG(kLOG_ERROR) << "index and output number don't match"; - return false; + auto num = src.numel(); + if (src.type() == typeid(float)) { + des->data.Reset(const_cast(src.data()), + num * sizeof(float)); + } else { + des->data.Reset(const_cast(src.data()), + num * sizeof(int16_t)); } +} - for (int i = 0; i < num_result; i++) { - auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]); - - if (output_data->empty()) { - LOG(kLOG_ERROR) - << "At least one output should be set with tensors' names."; - return false; - } - - auto &output = (*output_data)[i]; - int output_length = output_tensor->numel(); - std::vector tensor_shape = - framework::vectorize(output_tensor->dims()); - - for (auto d : tensor_shape) { - output.shape.push_back(static_cast(d)); - } - - if (output.data.length() < output_length * sizeof(T)) { - output.data.Resize(output_length * sizeof(T)); - } +template +void PaddleMobilePredictor::FeedPaddleTensors( + const std::vector &inputs) { + auto num = inputs.size(); + std::vector tensors(num, framework::Tensor()); + for (int i = 0; i < num; i++) { + tensors[i].init(typeid(float)); + ConvertPaddleTensors(inputs[i], &tensors[i]); + } + paddle_mobile_->FeedTensorData(tensors); +} - memcpy(output.data.data(), output_tensor->template data(), - output_length * sizeof(T)); +template +void PaddleMobilePredictor::FetchPaddleTensors( + std::vector *outputs) { + // auto num = outputs->size(); + // PADDLE_MOBILE_ENFORCE(num > 0, "0 output pointers is not permitted"); + // std::vector tensors(num, nullptr); + outputs->clear(); + std::vector tensors; + paddle_mobile_->GetTensorResults(&tensors); + auto num = tensors.size(); + outputs->resize(num, PaddleTensor()); + for (int i = 0; i < num; i++) { + ConvertTensors(*tensors[i], &(*outputs)[i]); } +} - return true; +template +void PaddleMobilePredictor::GetPaddleTensor(const std::string &name, + PaddleTensor *output) { + framework::Tensor *t = paddle_mobile_->GetTensorByName(name); + ConvertTensors(*t, output); } + template void PaddleMobilePredictor::FeedData( const std::vector &inputs) { diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index 0cadd71c226b20331c8399d2cfd8873c093a6b84..15cb4a6d0bd82997f24ff5bbf0d15921c376b3f9 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "common/types.h" #include "io/paddle_inference_api.h" @@ -32,13 +33,15 @@ class PaddleMobilePredictor : public PaddlePredictor { std::vector* output_data, int batch_size = -1) override; #ifdef PADDLE_MOBILE_FPGA - bool Run(const std::vector& inputs, - std::vector* output_data, std::vector* index_data, - int batch_size = -1) override; void FeedData(const std::vector& inputs) override; void GetResults(std::vector* outputs) override; - void Predict_From_To(int start = 0, int end = -1) override; + void Predict_From_To(int start, int end) override; + void FeedPaddleTensors(const std::vector& inputs) override; + void FetchPaddleTensors(std::vector* outputs) override; + void GetPaddleTensor(const std::string& name, PaddleTensor* output) override; + #endif + ~PaddleMobilePredictor() override; private: diff --git a/src/io/ios_io/PaddleMobileCPU.h b/src/io/ios_io/PaddleMobileCPU.h index 69e8b894d7b16eefa36259b479902e6185e5a36e..0536f513aa00a26478c16820b20af5100f3ebc62 100644 --- a/src/io/ios_io/PaddleMobileCPU.h +++ b/src/io/ios_io/PaddleMobileCPU.h @@ -19,10 +19,21 @@ @interface PaddleMobileCPUResult: NSObject +/** + @b 输出指针 + */ @property (assign, nonatomic, readonly) float *output; +/** + @b 输出的 float 数 + * */ @property (assign, nonatomic, readonly) int outputSize; +/** + @b 维度信息, longlongValue + */ +@property (strong, nonatomic, readonly) NSArray *dim; + -(void)releaseOutput; @end @@ -92,11 +103,6 @@ andModelParamsLen:(size_t)combinedParamsLen andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf; -/* - * - * */ - - /** @b 对图像进行预处理, 需要外部开辟 output 内存, 外部释放 output 内存, 每一个像素经过这样的预处理 (x + means) * scale, 其中 x 为像素值 @@ -134,7 +140,7 @@ - (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; /** - 进行预测, 预处理 means 值为 0, scale 值为 1 + @b 进行预测, 预处理 means 值为 0, scale 值为 1 @param image 输入图像 @param dim 输入维度 @@ -142,6 +148,22 @@ */ - (PaddleMobileCPUResult *)predict:(CGImageRef)image dim:(NSArray *)dim; + +/** + @b 取出模型描述中 key 为 "fetch" 对应的输出 + + @return 预测结果 + */ +- (PaddleMobileCPUResult *)fetchOutput; + +/** + @b 当输出为多个时, 可用此函数取出对应的输出 + + @param key 模型中输出的key + @return 预测结果 + */ +- (PaddleMobileCPUResult *)fetchOutputWithKey:(NSString *)key; + /** @b 清理内存 */ diff --git a/src/io/ios_io/PaddleMobileCPU.mm b/src/io/ios_io/PaddleMobileCPU.mm index 7103dce16b4eeed8b2e63c93f5dbf4b122f06a84..f3a804e713c1e3caa5d806ceeca5b3b2d52ebce3 100644 --- a/src/io/ios_io/PaddleMobileCPU.mm +++ b/src/io/ios_io/PaddleMobileCPU.mm @@ -43,6 +43,10 @@ _outputSize = outputSize; } +-(void)toSetDim:(NSArray *)dim { + _dim = dim; +} + @end @implementation PaddleMobileCPUConfig @@ -92,6 +96,7 @@ static std::mutex shared_mutex; - (void)dealloc { if (pam_) { delete pam_; + pam_ = nullptr; } } @@ -105,6 +110,7 @@ static std::mutex shared_mutex; } - (BOOL)loadModel:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath { + std::lock_guard lock(shared_mutex); std::string model_path_str = std::string([modelPath UTF8String]); std::string weights_path_str = std::string([weighsPath UTF8String]); pam_->SetThreadNum(self.config.threadNum); @@ -119,12 +125,14 @@ static std::mutex shared_mutex; andModelBuf:(const uint8_t *)modelBuf andModelParamsLen:(size_t)combinedParamsLen andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf { + std::lock_guard lock(shared_mutex); pam_->SetThreadNum(self.config.threadNum); return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, const_cast(combinedParamsBuf), 
self.config.optimize, false, 1, self.config.loddable); } - (BOOL)load:(NSString *)modelAndWeightPath{ + std::lock_guard lock(shared_mutex); std::string model_path_str = std::string([modelAndWeightPath UTF8String]); if (loaded_ = pam_->Load(model_path_str, self.config.optimize, false, 1, self.config.loddable)) { return YES; @@ -241,17 +249,22 @@ static std::mutex shared_mutex; } paddle_mobile::framework::Tensor input_tensor; - paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec); - float *input_ptr = input_tensor.mutable_data(dims); - memcpy(input_ptr, input, numel * sizeof(float)); pam_->Predict(input_tensor); std::shared_ptr output = pam_->Fetch(); + auto output_dims = output->dims(); + std::vector output_dim_vec = vectorize(output_dims); + NSMutableArray *ocDim = [NSMutableArray array]; + for (int i = 0; i < output_dim_vec.size(); ++i) { + NSNumber *num = [NSNumber numberWithLongLong:output_dim_vec[i]]; + [ocDim addObject:num]; + } + float *output_pointer = new float[output->numel()]; memcpy(output_pointer, output->data(), @@ -259,6 +272,7 @@ static std::mutex shared_mutex; PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; [cpuResult toSetOutput: output_pointer]; + [cpuResult toSetDim: ocDim]; [cpuResult toSetOutputSize: output->numel()]; return cpuResult; @@ -304,21 +318,30 @@ static std::mutex shared_mutex; return nil; } - // input - std::vector predict_input; - for (int j = 0; j < numel; ++j) { - predict_input.push_back(dataPointer[j]); - } + paddle_mobile::framework::Tensor input_tensor; + paddle_mobile::framework::DDim dims = paddle_mobile::framework::make_ddim(dim_vec); + float *input_ptr = input_tensor.mutable_data(dims); + memcpy(input_ptr, dataPointer, + numel * sizeof(float)); - // predict - std::vector cpp_result = pam_->Predict(predict_input, dim_vec); + pam_->Predict(input_tensor); + std::shared_ptr output_tensor = pam_->Fetch(); + + auto output_dims = output_tensor->dims(); + std::vector output_dim_vec = vectorize(output_dims); + NSMutableArray *ocDim = [NSMutableArray array]; + for (int i = 0; i < output_dim_vec.size(); ++i) { + NSNumber *num = [NSNumber numberWithLongLong:output_dim_vec[i]]; + [ocDim addObject:num]; + } - float *output_pointer = new float[cpp_result.size()]; - memcpy(output_pointer, cpp_result.data(), - cpp_result.size() * sizeof(float)); + float *output_pointer = new float[output_tensor->numel()]; + memcpy(output_pointer, output_tensor->data(), + output_tensor->numel() * sizeof(float)); PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; [cpuResult toSetOutput: output_pointer]; - [cpuResult toSetOutputSize: cpp_result.size()]; + [cpuResult toSetDim: ocDim]; + [cpuResult toSetOutputSize: output_tensor->numel()]; free(output); CFRelease(cfData); @@ -331,8 +354,63 @@ static std::mutex shared_mutex; return [self predict:image dim:dim means:nil scale:1]; } +- (PaddleMobileCPUResult *)fetchOutput{ + if (pam_ && loaded_) { + auto tensorPtr = pam_->Fetch(); + float *output_pointer = new float[tensorPtr->numel()]; + memcpy(output_pointer, tensorPtr->data(), + tensorPtr->numel() * sizeof(float)); + auto dims = tensorPtr->dims(); + std::vector dim_vec = vectorize(dims); + + + NSMutableArray *ocDim = [NSMutableArray array]; + for (int i = 0; i < dim_vec.size(); ++i) { + NSNumber *num = [NSNumber numberWithLongLong:dim_vec[i]]; + [ocDim addObject:num]; + } + + PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; + [cpuResult toSetOutput: output_pointer]; + [cpuResult 
toSetDim: ocDim]; + [cpuResult toSetOutputSize: tensorPtr->numel()]; + + return cpuResult; + } + return nil; +} + +- (PaddleMobileCPUResult *)fetchOutputWithKey:(NSString *)key{ + if (pam_ && loaded_ && key.length) { + auto tensorPtr = pam_->Fetch(std::string([key cStringUsingEncoding:NSUTF8StringEncoding])); + float *output_pointer = new float[tensorPtr->numel()]; + memcpy(output_pointer, tensorPtr->data(), + tensorPtr->numel() * sizeof(float)); + + auto dims = tensorPtr->dims(); + std::vector dim_vec = vectorize(dims); + + NSMutableArray *ocDim = [NSMutableArray array]; + for (int i = 0; i < dim_vec.size(); ++i) { + NSNumber *num = [NSNumber numberWithLongLong:dim_vec[i]]; + [ocDim addObject:num]; + } + + PaddleMobileCPUResult *cpuResult = [[PaddleMobileCPUResult alloc] init]; + [cpuResult toSetOutput: output_pointer]; + [cpuResult toSetDim: ocDim]; + [cpuResult toSetOutputSize: tensorPtr->numel()]; + + return cpuResult; + } + return nil; +} + - (void)clear{ - pam_->Clear(); + std::lock_guard lock(shared_mutex); + if (pam_) { + pam_->Clear(); + } } @end diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 42509915d13cf7e632ed20c73f1320ec8bac09d1..81779da1eafa892bd800641e133014988dab13e8 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -24,21 +24,29 @@ limitations under the License. */ #include #include #include +#include #include -// #define PADDLE_MOBILE_FPGA - namespace paddle_mobile { #ifdef PADDLE_MOBILE_FPGA namespace fpga { int open_device(); -} +void* fpga_malloc(size_t size); +void fpga_free(void* ptr); +} // namespace fpga #endif enum PaddleDType { FLOAT32, + FLOAT16, INT64, + INT8, +}; + +enum LayoutType { + LAYOUT_CHW = 1, + LAYOUT_HWC = 0, }; class PaddleBuf { @@ -78,6 +86,8 @@ struct PaddleTensor { // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. 
PaddleDType dtype; + std::type_index dtypeid = typeid(float); + LayoutType layout; }; enum class PaddleEngineKind { @@ -116,12 +126,13 @@ class PaddlePredictor { std::string param_file; }; #ifdef PADDLE_MOBILE_FPGA - virtual bool Run(const std::vector& inputs, - std::vector* output_data, - std::vector* index_data, int batch_size = -1) = 0; virtual void FeedData(const std::vector& inputs) = 0; virtual void GetResults(std::vector* outputs) = 0; - virtual void Predict_From_To(int start = 0, int end = -1) = 0; + virtual void Predict_From_To(int start, int end) = 0; + virtual void FeedPaddleTensors(const std::vector& inputs) = 0; + virtual void FetchPaddleTensors(std::vector* outputs) = 0; + virtual void GetPaddleTensor(const std::string& name, + PaddleTensor* output) = 0; #endif protected: diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 0b47d595c4a5a02d13524c78866c126d827a5805..6294f6bf467b1c1684d87c51b9a3b04508d56016 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -238,6 +238,18 @@ void PaddleMobile::GetResults(std::vector *v) { executor_->GetResults(v); } +template +void PaddleMobile::GetTensorResults( + std::vector *v) { + executor_->GetTensorResults(v); +} + +template +framework::Tensor *PaddleMobile::GetTensorByName( + const std::string &name) { + return executor_->GetTensorByName(name); +} + template std::shared_ptr PaddleMobile::FetchResult( int id) { diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index c0ef24f7f2d4d70c1c6043cc0227dc33a072f2a0..e3fd9f40f4194ed2841ba11366c6c5142e6279ab 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -91,7 +91,12 @@ class PaddleMobile { void InjectVariable(const framework::Tensor &t, std::string var_name); void FeedData(const framework::Tensor &t); void FeedData(const std::vector &v); + void FeedTensorData(const std::vector &v); + void GetResults(std::vector *v); + void GetTensorResults(std::vector *v); + framework::Tensor *GetTensorByName(const std::string &name); + std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/operators/activation_op.cpp b/src/operators/activation_op.cpp index afe806a651a92dd775bb94cef35397e563fc9208..158eb8eb47e872ed3c90fd4ae3ea1a9d257333e6 100644 --- a/src/operators/activation_op.cpp +++ b/src/operators/activation_op.cpp @@ -59,6 +59,7 @@ REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op); REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_CL REGISTER_OPERATOR_CL(relu, ops::ReluOp); diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp index 630b672225f139891d136844558f9e418ac54508..b87d1d3e80fd7945dd0cf4571041c18378e6ac1a 100644 --- a/src/operators/detection_ops.cpp +++ b/src/operators/detection_ops.cpp @@ -65,6 +65,23 @@ void PSRoiPoolOp::InferShape() const { } #endif +#ifdef ROIALIGN_POOL_OP +template +void RoiAlignPoolOp::InferShape() const { + const auto &rois_dims = this->param_.input_rois_->dims(); + const int pooled_height = this->param_.pooled_height_; + const int pooled_width = this->param_.pooled_width_; + + auto out_dims = this->param_.input_x_->dims(); + out_dims[0] = rois_dims[0]; + // out_dims[1] = + // output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + this->param_.output_->Resize(out_dims); +} +#endif + #ifdef ROI_PERSPECTIVE_OP template void 
RoiPerspectiveOp::InferShape() const { @@ -110,4 +127,8 @@ REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp); #ifdef PSROI_POOL_OP REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp); #endif +#ifdef ROIALIGN_POOL_OP +REGISTER_OPERATOR_FPGA(roialign_pool, ops::RoiAlignPoolOp); +#endif + #endif diff --git a/src/operators/detection_ops.h b/src/operators/detection_ops.h index 38d0890756a84bfc70119f30d8515159c57cca21..3b3a54dc4ba2e99eabe2250de63f38c7c7744d47 100644 --- a/src/operators/detection_ops.h +++ b/src/operators/detection_ops.h @@ -34,6 +34,10 @@ DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel); DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel); #endif +#ifdef ROIALIGN_POOL_OP +DECLARE_OPERATOR(RoiAlignPool, RoiAlignPoolParam, RoiAlignPoolKernel); +#endif + #ifdef ROI_PERSPECTIVE_OP DECLARE_OPERATOR(RoiPerspective, RoiPerspectiveParam, RoiPerspectiveKernel); #endif diff --git a/src/operators/elementwise_mul_op.cpp b/src/operators/elementwise_mul_op.cpp index 335a908ace54664f0bcbca37bdcde30047edee5d..3417fedbb2b8717355e1a7492321ecd5d7c6a9c3 100644 --- a/src/operators/elementwise_mul_op.cpp +++ b/src/operators/elementwise_mul_op.cpp @@ -36,6 +36,7 @@ REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp); REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp); #endif #endif diff --git a/src/operators/fill_constant_op.h b/src/operators/fill_constant_op.h index d9920cba9cc977c89021f08a8e6e0ed81f9f08b5..3d078d0e59b98b999e86c8602b8349f36139b841 100644 --- a/src/operators/fill_constant_op.h +++ b/src/operators/fill_constant_op.h @@ -34,7 +34,7 @@ class FillConstantOp : public framework::OperatorBase { const framework::AttributeMap attrs, framework::Scope *scope) : framework::OperatorBase(type, inputs, outputs, attrs, scope), - param_(inputs, outputs, attrs, *scope) {} + param_(inputs, outputs, attrs, scope.get()) {} void RunImpl() { auto data_type = static_cast<_PaddleMobile__Framework__Proto__VarType__Type>( diff --git a/src/operators/fusion_deconv_add_bn_op.cpp b/src/operators/fusion_deconv_add_bn_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb22e29f0903259d7bcf46271fb2a8bd70ba8eb7 --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_op.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBN_OP + +#include "operators/fusion_deconv_add_bn_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_add_bn_op.h b/src/operators/fusion_deconv_add_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f7f9b9e2094a7228c944b70b88ae3105ae9f37e8 --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_op.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_DECONVADDBN_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvAddBNMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvAddBNMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN; } +}; + +template +class FusionDeconvAddBNOp : public framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNParam, + operators::DeconvAddBNKernel> { + public: + FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNParam, + operators::DeconvAddBNKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + 
"ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_ADD_BN_OP diff --git a/src/operators/fusion_deconv_add_bn_relu_op.cpp b/src/operators/fusion_deconv_add_bn_relu_op.cpp new file mode 100755 index 0000000000000000000000000000000000000000..b7e9abe660b350e9d3ccc89aef685505a7449a9f --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_relu_op.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#include "operators/fusion_deconv_add_bn_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu, + ops::FusionDeconvAddBNReluMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_add_bn_relu_op.h b/src/operators/fusion_deconv_add_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97070ef01e544839be8eab6ddba21c43dfa9a26e --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_relu_op.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVADDBNRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvAddBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvAddBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; } +}; + +template +class FusionDeconvAddBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNReluParam, + operators::DeconvAddBNReluKernel> { + public: + FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNReluParam, + operators::DeconvAddBNReluKernel>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_ADD_BN_RELU_OP diff --git a/src/operators/fusion_deconv_bn_relu_op.cpp b/src/operators/fusion_deconv_bn_relu_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22f549d1fcd501c420d3fb3c209c4dbb1273f7a8 --- /dev/null +++ 
b/src/operators/fusion_deconv_bn_relu_op.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVBNRELU_OP + +#include "operators/fusion_deconv_bn_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_bn_relu, ops::FusionDeconvBNReluMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_bn_relu, ops::FusionDeconvBNReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_bn_relu_op.h b/src/operators/fusion_deconv_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ad0920ebd69b1a13ebc0e85f2c5f6008379715da --- /dev/null +++ b/src/operators/fusion_deconv_bn_relu_op.h @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVBNRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_BN_RELU; } +}; + +template +class FusionDeconvBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionDeconvBNReluParam, + operators::DeconvBNReluKernel> { + public: + FusionDeconvBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvBNReluParam, + operators::DeconvBNReluKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_BN_RELU_OP diff --git a/src/operators/fusion_fc_relu_op.cpp b/src/operators/fusion_fc_relu_op.cpp index b19e94cf9a8255b7e9d860cdd17fcfa76274aa02..e11da8814b3a5ef3b128be944965fb97d6142da8 100644 --- a/src/operators/fusion_fc_relu_op.cpp +++ b/src/operators/fusion_fc_relu_op.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_FC_RELU_OP +#ifdef FUSION_FCRELU_OP #include "operators/fusion_fc_relu_op.h" namespace paddle_mobile { diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..de19127e68361bd51f25a15c4c7ab69639707433 --- /dev/null +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -0,0 +1,137 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" +#include "operators/kernel/central-arm-func/conv_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvKernel::Init(ConvParam *param) { + bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Filter()->dims()[2] == 3; + bool conv5x5 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Filter()->dims()[2] == 5; + bool depth3x3 = conv3x3 && param->Groups() == param->Input()->dims()[1] && + param->Input()->dims()[1] == param->Output()->dims()[1]; + bool depth5x5 = conv5x5 && param->Groups() == param->Input()->dims()[1] && + param->Input()->dims()[1] == param->Output()->dims()[1]; + if (param->Filter()->type() == typeid(int8_t)) { +#ifndef __aarch64__ + if (depth3x3 && param->Strides()[0] < 3 && + param->Strides()[0] == param->Strides()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_INT8; + } else if (depth5x5 && param->Strides()[0] < 2 && + param->Strides()[0] == param->Strides()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE5x5_INT8; + } else { +#endif // __aarch64__ + param->ExecMode() = ConvParam::EXEC_GEMM_INT8; +#ifndef __aarch64__ + } +#endif // __aarch64__ + } else { + if (depth3x3 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 1 && param->Paddings()[0] == 1 && + param->Paddings()[0] == param->Paddings()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1P1_FLOAT; + } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 2 && param->Paddings()[0] == 0 && + param->Paddings()[0] == param->Paddings()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2P0_FLOAT; + } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 2 && param->Paddings()[0] == 1 && + param->Paddings()[0] == param->Paddings()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2P1_FLOAT; + } else if (depth3x3) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; +#ifndef __aarch64__ + } else if (depth5x5 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 1) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE5x5_FLOAT; + } else if (conv3x3 && param->Strides()[0] == param->Strides()[1] && + param->Dilations()[0] == param->Dilations()[1] && + param->Strides()[0] == 1 && param->Dilations()[0] == 1 && + param->Output()->dims()[1] 
>= 16 && + param->Input()->dims()[1] >= 16 && + param->Input()->dims()[2] <= 140 /* refered from ncnn */) { + param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; + // transform weight + param->transformed_filter_ = new framework::Tensor; + operators::math::winograd_transform_weight<8, 3>( + *param->Filter(), param->transformed_filter_); +#endif + } else { + param->ExecMode() = ConvParam::EXEC_GEMM_FLOAT; + } + } + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam ¶m) { + switch (param.ExecMode()) { + case ConvParam::EXEC_GEMM_INT8: + GemmConv(param); + break; +#ifndef __aarch64__ + case ConvParam::EXEC_DEPTHWISE3x3_INT8: + DepthwiseConv3x3(param); + break; + case ConvParam::EXEC_DEPTHWISE5x5_INT8: + DepthwiseConv5x5(param); + break; +#endif // __aarch64__ + case ConvParam::EXEC_DEPTHWISE3x3S1P1_FLOAT: + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + nullptr, false, false); + break; + case ConvParam::EXEC_DEPTHWISE3x3S2P1_FLOAT: + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), nullptr, false, false); + break; + case ConvParam::EXEC_DEPTHWISE3x3S2P0_FLOAT: + math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), + nullptr, false, false); + break; + case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: + math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), + param.Filter(), nullptr, param.Output(), false); + break; +#ifndef __aarch64__ + case ConvParam::EXEC_DEPTHWISE5x5_FLOAT: + DepthwiseConv5x5(param); + break; + case ConvParam::EXEC_WINOGRAD3X3_FLOAT: + WinogradConv3x3<8, 3>(param); + break; +#endif // __aarch64__ + case ConvParam::EXEC_GEMM_FLOAT: + GemmConv(param); + break; + default: + PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d", + param.ExecMode()); + } +} + +template class ConvKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp index f9489330ca33a7b055ff91e9c8e259d1feb1e827..0cbb55acb9e771b14ba727c1c354a83752690e47 100644 --- a/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp @@ -43,8 +43,11 @@ bool ConvAddBNReluKernel::Init( inv_std_ptr[i] = 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); } - LoDTensor *new_scale = new LoDTensor(); - LoDTensor *new_bias = new LoDTensor(); + // Tensor *new_scale = new Tensor(); + // Tensor *new_bias = new Tensor(); + Tensor *new_scale = param->CreateNewScale(); + Tensor *new_bias = param->CreateNewBiase(); + auto new_scale_ptr = new_scale->mutable_data({C}); auto new_bias_ptr = new_bias->mutable_data({C}); for (int i = 0; i < C; i++) { diff --git a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp index 15129d72895a89a4cba918d7a8da747a17962f58..4de479a1b9d1966a053ca04076f5c4a75936fae9 100644 --- a/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp @@ -42,8 +42,9 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { inv_std_ptr[i] = 1 / static_cast(pow((variance_ptr[i] + epsilon), 0.5)); } - LoDTensor *new_scale = new LoDTensor(); - LoDTensor *new_bias = new LoDTensor(); + + Tensor *new_scale = param->CreateNewScale(); + Tensor *new_bias = param->CreateNewBiase(); auto new_scale_ptr = 
new_scale->mutable_data({C}); auto new_bias_ptr = new_bias->mutable_data({C}); for (int i = 0; i < C; i++) { diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index c8af30a65590ef6dd3592da8709b94cba1ee60c6..79186be79e5b4207e9f6b2b221bc1a4160e4e67e 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -167,7 +167,7 @@ float find_abs_max(const Tensor *input) { max_abs = vmaxvq_f32(__max); #endif for (size_t i = 0; i < remain; ++i) { - max_abs = std::max(max_abs, fabs(x[i])); + max_abs = std::max(max_abs, static_cast(fabs(x[i]))); } return max_abs; } diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..0051fc9ae8cfb57fa6e602422b89f90f930f25a8 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADD_OP +#pragma once + +#include +#include "operators/math/conv_func.h" +#include "operators/math/depthwise_conv3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +void ConvAddBasic(const FusionConvAddParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + Tensor *output = param.Output(); + output->mutable_data(); + float *biase_data = bias.data(); + + int axis = param.Axis(); + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + 
framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::MatMul(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), false, biase_data); + } + } +} + +template +void ConvAddCompute(const FusionConvAddParam ¶m) { + param.Output()->mutable_data(); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && + param.paddings_[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + param.Bias(), true, false); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), param.Bias(), + // param.Output(), false); + if (param.Paddings()[0] == 0) { + math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), + param.Bias(), true, false); + } else { + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.Bias(), true, false); + } + } else { + ConvAddBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..5ee1e251d95d499f368899cf5ed712d498ef7b51 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVADDBNRELU_OP + +#pragma once + +#include +#include "operators/math/depthwise_conv3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +void ConvAddBNReluBasic(const FusionConvAddBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + Tensor *output = param.Output(); + output->mutable_data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::MatMulWithBn(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0), true, &new_scale, &new_bias, g); + } + } +} + +template +void ConvAddBNReluCompute(const FusionConvAddBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && 
param.Strides()[0] == 1 && + param.paddings_[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvAddBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..9f8e885a3160f0c9c04e11736c3feec363ffe8cb --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h @@ -0,0 +1,154 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVADDRELU_OP + +#pragma once +#include +#include +#include "operators/math/conv_func.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +void ConvAddReluBasic(const FusionConvAddReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor bias = *param.Bias(); + int32_t axis = param.Axis(); + Otype *bias_data = bias.data(); + Tensor *output = param.Output(); + output->mutable_data(); + + float alpha = 1.0f; + float beta = 1.0f; + int32_t groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int32_t batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / 
filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int32_t in_step = static_cast(input->dims()[1]) / groups; + int32_t out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int32_t i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int32_t g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::MatMul(filter_slice, false, col_matrix, false, alpha, + &out_slice, beta, true, bias_data); + } + } +} + +template +void ConvAddReluCompute(const FusionConvAddReluParam ¶m) { + param.Output()->mutable_data(); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && + param.paddings_[0] == 1) { + math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), + param.Bias(), true, true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConv3x3(param.Input(), param.Strides(), + // param.Paddings(), + // param.Filter(), param.Bias(), + // param.Output(), false); + if (param.Paddings()[0] == 0) { + math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), + param.Bias(), true, true); + } else { + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.Bias(), true, true); + } + } else { + ConvAddReluBasic(param); + } +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..1ff51aa39c880e1619af4e158bc77815e0dc1278 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_CONVBNADDRELU_OP + +#pragma once + +#include +#include "operators/math/depthwise_conv3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +void ConvBNAddReluBasic(const FusionConvBNAddReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + Tensor *bias1 = param.Bias(); + Tensor *output = param.Output(); + output->mutable_data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step); + math::MatMulWithBn(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(1), true, &new_scale, &new_bias, g, + bias_data.data()); + } + } +} 
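// Editorial aside, not part of the original patch: ConvBNAddReluCompute below
// follows the same dispatch rule as the other Fusion*Compute entry points added
// in this change. It only takes a hand-written depthwise fast path when the
// convolution is a 3x3 depthwise one, i.e. groups == input channels == output
// channels and the filter is square with side 3; stride 1 with padding 1 then
// maps to DepthwiseConvAddBNRelu3x3s1p1, stride 2 to
// DepthwiseConvAddBNRelu3x3s2p1v2, and every other shape falls back to the
// im2col + GEMM loop in ConvBNAddReluBasic above. A minimal sketch of that
// predicate, written with hypothetical plain-integer parameters rather than the
// real ConvParam accessors:
//
//   inline bool IsDepthwise3x3(int64_t groups, int64_t in_channels,
//                              int64_t out_channels, int64_t filter_h,
//                              int64_t filter_w) {
//     return groups == in_channels && in_channels == out_channels &&
//            filter_h == filter_w && filter_h == 3;
//   }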
+template +void ConvBNAddReluCompute(const FusionConvBNAddReluParam ¶m) { + Tensor Bias; + Bias.mutable_data({param.Groups()}); + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && + param.paddings_[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvBNAddReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..5606eb3304ac1384fdbd5c3899b6ad3186d315b6 --- /dev/null +++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_CONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +void ConvBNReluBasic(const FusionConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + output->mutable_data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + math::MatMulWithBn(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0), true, &new_scale, &new_bias, g); + } + } +} + +template +void ConvBNReluCompute(const FusionConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && + 
param.paddings_[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + ConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..150485032491ab9b5051ee0bc458ff8ca2a700e2 --- /dev/null +++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h @@ -0,0 +1,144 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DWCONVBNRELU_OP + +#pragma once +#include +#include "operators/math/depthwise_conv3x3.h" +#include "operators/math/im2col.h" +#include "operators/math/math_function.h" +#include "operators/math/vol2col.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +void DWConvBNReluBasic(const FusionDWConvBNReluParam ¶m) { + const Tensor *input = param.Input(); + Tensor filter = *param.Filter(); + Tensor new_bias = *param.NewBias(); + Tensor new_scale = *param.NewScale(); + + Tensor *output = param.Output(); + output->mutable_data(); + + int groups = param.Groups(); + std::vector strides = param.Strides(); + std::vector paddings = param.Paddings(); + std::vector dilations = param.Dilations(); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + + std::vector output_shape_vec(framework::vectorize(output->dims())); + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = + math::IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col.mutable_data(col_shape); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = framework::slice_ddim( + input->dims(), 1, static_cast(input->dims().size())); + + framework::DDim filter_matrix_shape = 
{filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(in_slice, dilations, strides, paddings, &col); + } + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + math::MatMulWithBn(filter_slice, false, col_matrix, false, + static_cast(1), &out_slice, + static_cast(0), true, &new_scale, &new_bias, g); + } + } +} +template +void DWConvBNReluCompute(const FusionDWConvBNReluParam ¶m) { + if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1 && + param.paddings_[0] == 1) { + math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else if (param.Groups() == param.Input()->dims()[1] && + param.Input()->dims()[1] == param.Output()->dims()[1] && + param.Filter()->dims()[2] == param.Filter()->dims()[3] && + param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { + // math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(), + // param.Output(), param.NewScale(), + // param.NewBias(), 1); + math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), param.NewScale(), + param.NewBias(), true); + } else { + DWConvBNReluBasic(param); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/deconv_add_bn_kernel.h b/src/operators/kernel/deconv_add_bn_kernel.h new file mode 100755 index 0000000000000000000000000000000000000000..181367031c0be48666efeda3df4426da38c67d4f --- /dev/null +++ b/src/operators/kernel/deconv_add_bn_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#ifdef FUSION_DECONVADDBN_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvAddBNKernel
+    : public OpKernelBase<DeviceType, FusionDeconvAddBNParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvAddBNParam<DeviceType> &param);
+
+  bool Init(FusionDeconvAddBNParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/deconv_add_bn_relu_kernel.h b/src/operators/kernel/deconv_add_bn_relu_kernel.h
new file mode 100755
index 0000000000000000000000000000000000000000..c63b4db050ade64903ff817b40900faaef65924d
--- /dev/null
+++ b/src/operators/kernel/deconv_add_bn_relu_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVADDBNRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvAddBNReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvAddBNReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvAddBNReluParam<DeviceType> &param);
+
+  bool Init(FusionDeconvAddBNReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/deconv_bn_relu_kernel.h b/src/operators/kernel/deconv_bn_relu_kernel.h
new file mode 100755
index 0000000000000000000000000000000000000000..4ab0257b07e53149ff88c6a6ecca2dc77c0eb634
--- /dev/null
+++ b/src/operators/kernel/deconv_bn_relu_kernel.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_DECONVBNRELU_OP
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using framework::OpKernelBase;
+
+template <typename DeviceType, typename T>
+class DeconvBNReluKernel
+    : public OpKernelBase<DeviceType, FusionDeconvBNReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDeconvBNReluParam<DeviceType> &param);
+
+  bool Init(FusionDeconvBNReluParam<DeviceType> *param);
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h
index 417c68fff7d0e88d2e1fcc1dc8c1f14aa3a4399b..77c35b0253d06f2bc979861e53daeba815b46647 100644
--- a/src/operators/kernel/detection_kernel.h
+++ b/src/operators/kernel/detection_kernel.h
@@ -14,6 +14,7 @@ limitations under the License.
*/ #pragma once +#include #include #include "framework/operator.h" #include "operators/op_param.h" @@ -27,12 +28,14 @@ class AnchorGeneratorParam : public OpParam { public: AnchorGeneratorParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = OpParam::GetVarValue("Input", inputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = + OpParam::GetVarValue("Input", inputs, *scope); output_anchors_ = - OpParam::GetVarValue("Anchors", outputs, scope); - output_variances_ = - OpParam::GetVarValue("Variances", outputs, scope); + OpParam::GetVarValue("Anchors", outputs, *scope); + output_variances_ = OpParam::GetVarValue( + "Variances", outputs, *scope); anchor_sizes_ = OpParam::GetAttr>("anchor_sizes", attrs); aspect_ratios_ = @@ -64,22 +67,23 @@ template class ProposalParam : public OpParam { public: ProposalParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { scores_ = - OpParam::GetVarValue("Scores", inputs, scope); - bbox_deltas_ = - OpParam::GetVarValue("BboxDeltas", inputs, scope); + OpParam::GetVarValue("Scores", inputs, *scope); + bbox_deltas_ = OpParam::GetVarValue("BboxDeltas", + inputs, *scope); im_info_ = - OpParam::GetVarValue("ImInfo", inputs, scope); + OpParam::GetVarValue("ImInfo", inputs, *scope); anchors_ = - OpParam::GetVarValue("Anchors", inputs, scope); + OpParam::GetVarValue("Anchors", inputs, *scope); variances_ = - OpParam::GetVarValue("Variances", inputs, scope); + OpParam::GetVarValue("Variances", inputs, *scope); rpn_rois_ = - OpParam::GetVarValue("RpnRois", outputs, scope); + OpParam::GetVarValue("RpnRois", outputs, *scope); rpn_probs_ = OpParam::GetVarValue("RpnRoiProbs", - outputs, scope); + outputs, *scope); pre_nms_topn_ = OpParam::GetAttr("pre_nms_topN", attrs); post_nms_topn_ = OpParam::GetAttr("post_nms_topN", attrs); @@ -95,6 +99,8 @@ class ProposalParam : public OpParam { framework::Tensor *anchors_; framework::Tensor *variances_; + std::shared_ptr score_index_; + framework::LoDTensor *rpn_rois_; framework::LoDTensor *rpn_probs_; @@ -117,11 +123,13 @@ template class PSRoiPoolParam : public OpParam { public: PSRoiPoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = OpParam::GetVarValue("X", inputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = OpParam::GetVarValue("X", inputs, *scope); input_rois_ = - OpParam::GetVarValue("ROIs", inputs, scope); - output_ = OpParam::GetVarValue("Out", outputs, scope); + OpParam::GetVarValue("ROIs", inputs, *scope); + output_ = + OpParam::GetVarValue("Out", outputs, *scope); output_channels_ = OpParam::GetAttr("output_channels", attrs); pooled_height_ = OpParam::GetAttr("pooled_height", attrs); @@ -146,17 +154,56 @@ class PSRoiPoolParam : public OpParam { DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam); #endif +#ifdef ROIALIGN_POOL_OP +template +class RoiAlignPoolParam : public OpParam { + public: + RoiAlignPoolParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = OpParam::GetVarValue("X", inputs, *scope); + input_rois_ = + OpParam::GetVarValue("ROIs", inputs, *scope); + output_ = + 
OpParam::GetVarValue("Out", outputs, *scope); + + pooled_height_ = OpParam::GetAttr("pooled_height", attrs); + pooled_width_ = OpParam::GetAttr("pooled_width", attrs); + spatial_scale_ = OpParam::GetAttr("spatial_scale", attrs); + sampling_ratio_ = OpParam::GetAttr("sampling_ratio", attrs); + } + + public: + framework::Tensor *input_x_; + framework::LoDTensor *input_rois_; + framework::Tensor *output_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_input, float_output; + fpga::BypassArgs input_arg, output_arg; +#endif +}; + +DECLARE_KERNEL(RoiAlignPool, RoiAlignPoolParam); +#endif + #ifdef ROI_PERSPECTIVE_OP template class RoiPerspectiveParam : public OpParam { public: RoiPerspectiveParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = OpParam::GetVarValue("X", inputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = OpParam::GetVarValue("X", inputs, *scope); input_rois_ = - OpParam::GetVarValue("ROIs", inputs, scope); - output_ = OpParam::GetVarValue("Out", outputs, scope); + OpParam::GetVarValue("ROIs", inputs, *scope); + output_ = + OpParam::GetVarValue("Out", outputs, *scope); spatial_scale_ = OpParam::GetAttr("spatial_scale", attrs); transformed_height_ = OpParam::GetAttr("transformed_height", attrs); diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h index 54baa50fcafb8ddbbefecb635ea85f120f16250d..f71b6257d5c978735198f1b42e15f3f454eb8787 100644 --- a/src/operators/kernel/elementwise_mul_kernel.h +++ b/src/operators/kernel/elementwise_mul_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #pragma once #include "framework/operator.h" -#include "operators/math/elementwise_op_function.h" #include "operators/op_param.h" namespace paddle_mobile { diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp index 4e68b5e30ccc53ae84deb0866f982d70e175d8eb..359c34b0cefa20ee13789402c87c8f13ca31cc50 100644 --- a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp +++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp @@ -43,9 +43,11 @@ bool AnchorGeneratorKernel::Init( // DLOG << "stride_height: " << stride_height; for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + int offset0 = h_idx * feature_width * num_anchors * 4; for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - int offset = h_idx * w_idx * num_anchors * 4; + int offset1 = w_idx * num_anchors * 4; for (int idx = 0; idx < num_anchors; idx++) { + int offset = offset0 + offset1 + idx * 4; anchor_ptr[offset + 0] = anchors_offset[idx * 4 + 0] + w_idx * stride_width; anchor_ptr[offset + 1] = diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp old mode 100644 new mode 100755 diff --git a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index eb5b913b730183be88d2470b1f57783aba15eb92..4ce8265f7f780d5ea4291783e309cd9507bf18b6 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -16,13 +16,10 @@ limitations under the License. 
*/ #include "operators/kernel/conv_bn_relu_kernel.h" #include - namespace paddle_mobile { namespace operators { - template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -43,7 +40,6 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); @@ -51,24 +47,36 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - + const int groups = param->Groups(); + if (groups == channel) { + fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); + fpga::DWconvArgs dwconv_arg = {0}; + fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], new_bias_ptr); + param->SetFpgaArgs(dwconv_arg); + } else { + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + } delete new_scale; delete new_bias; return true; } - template <> void ConvBNReluKernel::Compute( const FusionConvBNReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWConv(param.FpgaDwconvArgs()); + } else { + fpga::ComputeFpgaConv(param.FpgaArgs()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/conv_kernel.cpp b/src/operators/kernel/fpga/V1/conv_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..73722820bd90b54abd64dd01b157c74c6a1069e8 --- /dev/null +++ b/src/operators/kernel/fpga/V1/conv_kernel.cpp @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_OP + +#include "operators/kernel/conv_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvKernel::Init(ConvParam *param) { + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + int channel = out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = 0; + } + + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void ConvKernel::Compute(const ConvParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..788504df5d2ea1005cfaa76f12b58e61c0218391 --- /dev/null +++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE_OP + +#include "operators/kernel/conv_transpose_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvTransposeKernel::Init(ConvTransposeParam *param) { + // bool relu_enabled = false; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + // const Tensor *bias = param->Bias(); + // auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + // "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = 0; // bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void ConvTransposeKernel::Compute( + const ConvTransposeParam ¶m) { + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4239ac1e5da421cb0e2421a8919d8d15e40348af --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBN_OP + +#include "operators/kernel/deconv_add_bn_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void DeconvAddBNKernel::Compute( + const FusionDeconvAddBNParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..28b8c83198a5517ed0dc9732e0033030a876a7da --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#include "operators/kernel/deconv_add_bn_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddBNReluKernel::Init( + FusionDeconvAddBNReluParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::LEAKYRELU; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void DeconvAddBNReluKernel::Compute( + const FusionDeconvAddBNReluParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f166587109e5f63e30203a940aa3baa8ae87f844 --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_bn_relu_kernel.cpp @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVBNRELU_OP + +#include "operators/kernel/deconv_bn_relu_kernel.h" +#include +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvBNReluKernel::Init( + FusionDeconvBNReluParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::LEAKYRELU; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + auto bn_mean_ptr = param->InputMean()->data(); + auto bn_var_ptr = param->InputVariance()->data(); + auto bn_scale_ptr = param->InputScale()->data(); + auto bn_bias_ptr = param->InputBias()->data(); + const float epsilon = param->Epsilon(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); + auto new_scale_ptr = new_scale->mutable_data({channel}); + auto new_bias_ptr = new_bias->mutable_data({channel}); + for (int i = 0; i < channel; i++) { + new_scale_ptr[i] = bn_scale_ptr[i] / + static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); + new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; + } + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel]; + bs_ptr[i] = new_bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + delete new_scale; + delete new_bias; + return true; +} + +template <> +void DeconvBNReluKernel::Compute( + const FusionDeconvBNReluParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp 
b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp index a830996524cba9ff05259bf7ccf3a55c99749a87..c549e5a6eee98f38f1806367054b925440e3ebf1 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -15,55 +15,176 @@ limitations under the License. */ #include "operators/kernel/elementwise_add_kernel.h" +#include +#include "fpga/V1/api.h" + namespace paddle_mobile { namespace operators { template <> bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::EWAddArgs ewaddArgs = {0}; - // ewaddArgs.relu_enabled = relu_enabled; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - ewaddArgs.const0 = 0x3c00; // =1 - ewaddArgs.const1 = 0x3c00; // =1 - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; - ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); + if (input_y->type() != typeid(float)) { + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto *input_x = const_cast(param->InputX()); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_fp16_ofm(out); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + // ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.output.activation.activation_type = activation_enable; + ewaddArgs.output.activation.leaky_relu_negative_slope = + leaky_relu_negative_slope; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + 
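+    // Both operands are fp16 feature maps in this branch, so the add stays on
+    // the FPGA EW engine (ComputeFpgaEWAdd below); const0/const1 are 0x3c00,
+    // the fp16 encoding of 1.0, matching the "=1" comments above.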
fpga::expand_EW_arg(&ewaddArgs); + param->SetFpgaArgs(ewaddArgs); + } else { + param->float_input_x.Resize(param->InputX()->dims()); + param->float_input_x.init(typeid(float)); + fpga::format_fp32_ofm(&(param->float_input_x)); + + param->float_out.Resize(param->InputX()->dims()); + // param->float_out.init(typeid(float)); + param->float_out.mutable_data(param->InputX()->dims()); + fpga::format_fp32_ofm(&(param->float_out)); + + fpga::format_fp16_ofm(out); + } return true; } +inline void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { + auto input_x = param.float_input_x; + auto input_y = param.InputY(); + auto Out = param.float_out; + int axis = param.Axis(); + + const auto &x_dims = input_x.dims(); + const auto &y_dims = input_y->dims(); + /// axis = -1 represent the last dimensions. + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + size_t batch = 1; + size_t channels = 1; + size_t elementwise_num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + elementwise_num *= x_dims[i]; + } + const float *bias_data = input_y->data(); + const float *input_data = input_x.data(); + float *output_data = Out.mutable_data(); + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + size_t offset = (i * channels + j) * elementwise_num; + const float *input = input_data + offset; + const float bias = bias_data[j]; + float *output = output_data + offset; + // DLOG << "output address: "<< output; + for (int k = 0; k < elementwise_num; ++k) { + output[k] = input[k] + bias; + // DLOG << "output[" << k << "]= " << output[k] ; + } + } + } +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + auto input_y = const_cast(param.InputY()); + if (input_y->type() != typeid(float)) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + } else { + auto input_x = const_cast(param.InputX()); + auto intput_x_float = const_cast(&(param.float_input_x)); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = input_x->data(); + args.image.channels = (uint32_t)(input_x->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = intput_x_float->data(); + args.output.scale_address = intput_x_float->scale; + + // fpga::fpga_flush(input_x->data(),input_x->fpga_data_num * + // sizeof(half)); + fpga::PerformBypass(args); + fpga::fpga_invalidate(args.output.address, + input_x->fpga_data_num * sizeof(float)); + + // just for test + /* { + static int cnt = 0; + if(cnt == 0){ + std::string str= "first_bypass_data"; + float rslt = 0.0f; + fpga::savefile(str, args.output.address, input_x->fpga_data_num, + rslt); cnt++; + } + }*/ + ElementwiseAddCompute(param); + + auto out_float = const_cast(&(param.float_out)); + DLOG << "out float: " << out_float->data(); + fpga::fpga_flush(out_float->data(), + input_x->fpga_data_num * sizeof(float)); + // just for test + /*{ + static int cnt = 0; + if(cnt == 0){ + std::string str= "ew_output_data"; + float rslt = 0.0f; + + fpga::savefile(str, out_float->data(), input_x->fpga_data_num, + rslt); cnt++; + } + }*/ + auto Out = param.Out(); + args.input_data_type = 
fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = out_float->data(); + args.image.channels = (uint32_t)(input_x->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = Out->data(); + args.output.scale_address = Out->scale; + fpga::PerformBypass(args); + } } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e421ddb78ff4f1a0f0c51c985db9c26666001d03 --- /dev/null +++ b/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEMUL_OP + +#include "operators/kernel/elementwise_mul_kernel.h" +#include "operators/math/elementwise_op_function.h" + +namespace paddle_mobile { +namespace operators { + +template +struct MulFunctor { + inline T operator()(T a, T b) const { return a * b; } +}; +template <> +bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { + param->float_input_x.Resize(param->InputX()->dims()); + param->float_input_x.init(typeid(float)); + fpga::format_fp32_ofm(&(param->float_input_x)); + + param->float_out.Resize(param->InputX()->dims()); + param->float_out.init(typeid(float)); + fpga::format_fp32_ofm(&(param->float_out)); + + auto *out = param->Out(); + fpga::format_fp16_ofm(out); + return true; +} + +template <> +void ElementwiseMulKernel::Compute( + const ElementwiseMulParam ¶m) { + auto input_x = const_cast(param.InputX()); + auto intput_x_float = const_cast(&(param.float_input_x)); + // auto intput_x_32_ptr = + // const_cast(param.float_input_x.data()); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = input_x->data(); + args.image.channels = (uint32_t)(input_x->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = intput_x_float->data(); + args.output.scale_address = intput_x_float->scale; + fpga::PerformBypass(args); + fpga::fpga_invalidate(args.output.address, + input_x->fpga_data_num * sizeof(float)); + + auto input_y = param.InputY(); + int axis = param.Axis(); + auto out_float = const_cast(&(param.float_out)); + ElementwiseComputeEx, float>( + intput_x_float, input_y, axis, MulFunctor(), out_float); + fpga::fpga_flush(out_float->data(), + input_x->fpga_data_num * sizeof(float)); + + Tensor *Out = param.Out(); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + 
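+  // Second bypass: convert the fp32 product computed on the CPU (out_float)
+  // back to fp16 and write it into the kernel's fp16 output tensor Out.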
args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = out_float->data(); + args.image.channels = (uint32_t)(Out->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = Out->data(); + args.output.scale_address = Out->scale; + fpga::PerformBypass(args); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index a4b3ec85f3688066d00b37753a6533a7ef72a552..a661cd642c51a1baff2ac6ec97933831bd034c40 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -26,13 +26,9 @@ bool FeedKernel::Init(FeedParam *param) { input->Resize(output->dims()); if (output->dims().size() != 4) { - auto input_ptr = input->mutable_data(); - size_t size = output->numel() * sizeof(float); - auto p = fpga::fpga_malloc(size); - memcpy(p, input_ptr, size); - output->reset_data_ptr(p); return true; } + fpga::format_fp16_ofm(output); return true; } @@ -42,32 +38,68 @@ void FeedKernel::Compute(const FeedParam ¶m) { auto output = param.Out(); int col = param.Col(); auto input = const_cast(¶m.InputX()->at(col)); + std::type_index input_type = input->type(); + + if (input_type == typeid(float)) { + input->init(typeid(float)); + } else { // input_type == typeid(int8_t) + input->init(typeid(int8_t)); + } + input->Resize(output->dims()); - if (input->dims().size() != 4) { + if (output->dims().size() != 4) { + size_t size = output->numel() * sizeof(float); + auto output_ptr = output->data(); + auto input_ptr = input->data(); + auto external_ptr = reinterpret_cast(input->external_data); + float *p_data = external_ptr == nullptr ? input_ptr : external_ptr; + memcpy(output_ptr, p_data, size); + input->external_data = nullptr; return; } fpga::format_image(input); - auto input_ptr = input->data(); auto output_ptr = output->data(); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; + if (input_type == typeid(float)) { + auto input_ptr = input->data(); + auto external_ptr = reinterpret_cast(input->external_data); + float *p_data = external_ptr == nullptr ? 
input_ptr : external_ptr; - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.input_layout_type = fpga::LAYOUT_CHW; - args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = input_ptr; - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.pad_height = 0; - args.image.pad_width = 0; - args.output.address = output_ptr; - args.output.scale_address = output->scale; - fpga::PerformBypass(args); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = p_data; + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); + input->external_data = nullptr; + } else { // input_type == typeid(int8_t) + auto input_ptr = input->data(); + auto external_ptr = reinterpret_cast(input->external_data); + int8_t *p_data = external_ptr == nullptr ? input_ptr : external_ptr; - input->external_data = nullptr; + args.input_data_type = fpga::DATA_TYPE_INT8; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = p_data; + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output_ptr; + args.output.scale_address = output->scale; + fpga::PerformBypass(args); + input->external_data = nullptr; + } } template class FeedKernel; diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index 545fff88168a6cb245cfe4cdfd26d8e3de64a825..2aea5a770c674a7d70dc2abf0d691598444f9a25 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -11,9 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "operators/kernel/fetch_kernel.h" - namespace paddle_mobile { namespace operators { @@ -36,7 +34,7 @@ bool FetchKernel::Init(FetchParam *param) { args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; args.image.address = input->data(); - args.image.channels = (uint32_t)product(input->dims()); + args.image.channels = (uint32_t)(input->fpga_data_num); args.image.height = 1; args.image.width = 1; args.image.pad_height = 0; @@ -47,21 +45,56 @@ bool FetchKernel::Init(FetchParam *param) { return true; } - +void dealign(float *src, float *dst, int input_c, int input_h, int input_w) { + int alignCW = paddle_mobile::fpga::align_to_x(input_c * input_w, 16); + int dealignCW = input_c * input_w; + for (int h = 0; h < input_h; ++h) { + auto input_offset = h * alignCW; + auto output_offset = h * dealignCW; + memcpy((dst + output_offset), (src + input_offset), + dealignCW * sizeof(float)); + } +} template <> void FetchKernel::Compute(const FetchParam ¶m) { - auto input = param.InputX(); + auto input = const_cast(param.InputX()); if (input->type() == typeid(float)) { int col = param.Col(); auto output = &(param.Out()->at(col)); output->ShareDataWith(*input); return; } - fpga::PerformBypass(param.fpga_bypass_args); + + fpga::BypassArgs args = param.fpga_bypass_args; + auto input_address = (input->data()); + args.image.address = static_cast(input_address); + float *outdata_ptr = + reinterpret_cast(param.fpga_bypass_args.output.address); + const int num_th = 32; + if ((param.Out()->fpga_data_num) < num_th) { + fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half)); + + for (int idx = 0; idx < product(input->dims()); ++idx) { + outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]); + } + return; + } + + fpga::PerformBypass(args); + auto outC = param.Out()->dims()[1]; + auto outH = param.Out()->dims()[2]; + auto outW = param.Out()->dims()[3]; + fpga::fpga_invalidate(param.fpga_bypass_args.output.address, - param.fpga_bypass_args.image.channels * sizeof(float)); + param.Out()->fpga_data_num * sizeof(float)); - // TODO: DEalign: get rid of extra 0 + if (param.Out()->fpga_data_num != product(input->dims())) { + float *data_tmp = + reinterpret_cast(malloc(outC * outH * outW * sizeof(float))); + dealign(outdata_ptr, data_tmp, outC, outH, outW); + memcpy(outdata_ptr, data_tmp, outC * outH * outW * sizeof(float)); + free(data_tmp); + } } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp index 944dd20a55cbbec0abda2543c1ea6ea09f17bce8..3a29104d0fe0e3c69c9369fb1137b2c94ef04e43 100644 --- a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp @@ -30,8 +30,8 @@ bool FusionFcKernel::Init(FusionFcParam *param) { auto input_z_ptr = input_z->data(); auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); + // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + // "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT diff --git a/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fbeb63fe606aac014f76088210c74a4118e6c78 --- /dev/null +++ 
b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_FCRELU_OP + +#include "operators/kernel/fc_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcReluKernel::Init(FusionFcReluParam *param) { + // bool relu_enabled = false; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::LEAKYRELU; + int16_t leaky_relu_negative_slope = 0; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + const Tensor *input_z = param->InputZ(); + auto input_z_ptr = input_z->data(); + auto out = param->Out(); + + // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + // "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; + } + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + out->Resize(framework::make_ddim({1, channel, 1, 1})); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(filter); + fpga::format_fc_filter(filter, max_value); + + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + fpga::format_fp16_ofm(out); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, + leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void FusionFcReluKernel::Compute( + const FusionFcReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f47a585ee412316ce65084c5fa10a622ffb93a4f --- /dev/null +++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/pad2d_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool Pad2dKernel::Init(Pad2dParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} +void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) { + auto input_data = (input->data()); + auto output_data = (output->data()); + auto input_c = input->dims()[1]; + auto input_h = input->dims()[2]; + auto input_w = input->dims()[3]; + auto output_c = output->dims()[1]; + auto output_w = output->dims()[3]; + auto copysize = input_c * input_w; + for (int h = 0; h < input_h; ++h) { + auto input_offset = h * input_c * input_w; + auto output_offset = h * paddle_mobile::fpga::align_to_x( + output_c * output_w, IMAGE_ALIGNMENT); + memcpy((output_data + output_offset), (input_data + input_offset), + copysize * sizeof(half)); + } +} +template <> +void Pad2dKernel::Compute(const Pad2dParam ¶m) { + auto in_x = param.InputX(); + auto out = param.Out(); + fpga::fpga_invalidate((void *)in_x->data(), // NOLINT + in_x->numel() * sizeof(half)); + pad2dFunc(in_x, out); + (out->scale)[0] = (in_x->scale)[0]; + (out->scale)[1] = (in_x->scale)[1]; + DLOG << (out->scale)[0]; + DLOG << (out->scale)[1]; + size_t outputSize = + out->dims()[2] * + paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]), + IMAGE_ALIGNMENT) * + sizeof(half); + fpga::fpga_flush(out->data(), outputSize); +} +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index c249c1a18db7eca9dfe27bbbe8c25ec6acffd7f8..4c0e09e63f2785b535f81b5262afe93099a74aa5 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -22,15 +22,29 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { auto *input = const_cast(param->Input()); - auto input_ptr = input->data(); - Tensor *output = param->Output(); - fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); + auto *output = param->Output(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector paddings = param->Paddings(); std::string pooling_type = param->PoolingType(); + if (input->type() == typeid(float)) { + int channels = input->dims()[1]; + int height = input->dims()[2]; + int width = input->dims()[3]; + int num = input->dims()[0]; + int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1; + int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1; + framework::DDim dim = + framework::make_ddim({num, channels, out_height, out_width}); + output->mutable_data(dim); + return true; + } + + auto input_ptr = input->data(); + fpga::format_fp16_ofm(output); + auto output_ptr = output->mutable_data(); + fpga::PoolingArgs poolArgs = {0}; poolArgs.mode = pooling_type == "max" ? 
0 : 1; // max:0, avg:1 poolArgs.kernel_reciprocal = @@ -54,6 +68,34 @@ bool PoolKernel::Init(PoolParam *param) { template <> void PoolKernel::Compute(const PoolParam ¶m) { + auto *input = const_cast(param.Input()); + + if (input->type() == typeid(float)) { + auto *output = param.Output(); + auto in = input->data(); + auto N = input->dims()[0]; + output->Resize( + {N, output->dims()[1], output->dims()[2], output->dims()[3]}); + auto len = output->numel(); + auto out = output->mutable_data(); + int C = input->dims()[1], H = input->dims()[2], // N = input->dims()[0], + W = input->dims()[3]; + int HW = H * W, CHW = C * H * W, WC = W * C; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + out[n * C + c] = 0; + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + out[n * C + c] += in[n * CHW + h * WC + w * C + + c]; // in[n * CHW + c * HW + h * W + w]; // + } + } + out[n * C + c] /= HW; + } + } + return; + } fpga::ComputeFpgaPool(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 9f5f1134a74ef51dce2c28c73b503328f234a370..772c68059ddb85958279639626bfb9e2b36fb91b 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -67,8 +67,39 @@ bool ProposalKernel::Init(ProposalParam *param) { args.output.scale_address = param->float_score->scale; param->score_arg = args; + param->score_index_ = std::make_shared(); + param->score_index_->mutable_data({input->numel()}); + auto score_index = param->score_index_->data(); + for (int i = 0; i < input->numel(); ++i) { + score_index[i] = i; + } + return true; } +template +void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) { + PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1), + "Dim not correct"); + int64_t index_size = index.dims()[0]; + + auto src_dims = src.dims(); + + const T *p_src = src.data(); + const int *p_index = index.data(); + T *p_output = output->data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { auto *out_data = dst->data(); @@ -105,38 +136,49 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; - if (variances) { - bbox_center_x = - variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + - anchor_center_x; - bbox_center_y = variances_data[i * len + 1] * - bbox_deltas_data[i * len + 1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(std::min(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } else { - bbox_center_x = - bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 
3], - kBBoxClipDefault)) * - anchor_height; - } + /* + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + */ + bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + + /* + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + */ + bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + // } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + /* + //wong + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + //wong + */ + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; } // return proposals; } @@ -301,17 +343,20 @@ std::pair ProposalForOneImage( const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, const Tensor &bbox_deltas_slice, // [M, 4] const Tensor &scores_slice, // [N, 1] - int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, - float eta) { + const Tensor &score_index, int pre_nms_top_n, int post_nms_top_n, + float nms_thresh, float min_size, float eta) { auto *scores_data = scores_slice.data(); // Sort index Tensor index_t; index_t.Resize({scores_slice.numel()}); int *index = index_t.mutable_data(); - for (int i = 0; i < scores_slice.numel(); ++i) { + /*for (int i = 0; i < scores_slice.numel(); ++i) { index[i] = i; - } + }*/ + std::memcpy(index, score_index.data(), + scores_slice.numel() * sizeof(int)); + auto compare = [scores_data](const int64_t &i, const int64_t &j) { return scores_data[i] > scores_data[j]; }; @@ -330,9 +375,12 @@ std::pair ProposalForOneImage( anchor_sel.mutable_data({index_t.numel(), 4}); var_sel.mutable_data({index_t.numel(), 4}); + CPUGather(scores_slice, index_t, &scores_sel); + CPUGather(bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(anchors, index_t, &anchor_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}); - BoxCoder(&anchor_sel, &bbox_sel, &var_sel, &proposals); + BoxCoder(&anchor_sel, &bbox_sel, nullptr, &proposals); ClipTiledBoxes(im_info_slice, &proposals); @@ -343,6 +391,8 @@ std::pair ProposalForOneImage( bbox_sel.mutable_data({keep.numel(), 4}); scores_filter.mutable_data({keep.numel(), 1}); + CPUGather(proposals, keep, &bbox_sel); + CPUGather(scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -353,14 +403,86 @@ std::pair ProposalForOneImage( keep_nms.Resize({post_nms_top_n}); } - proposals.mutable_data({keep_nms.numel(), 4}); - 
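With the variance branch commented out, BoxCoder above reduces to plain center-size decoding (no kBBoxClipDefault clamp) followed by a corner conversion that no longer subtracts 1. The arithmetic written out for a single anchor/delta pair, as a sketch rather than the kernel's exact loop:

#include <cmath>

struct Box { float x1, y1, x2, y2; };

// Decode one [dx, dy, dw, dh] delta against an anchor given as corners.
Box decode_box(const Box& anchor, const float d[4]) {
  const float aw  = anchor.x2 - anchor.x1;
  const float ah  = anchor.y2 - anchor.y1;
  const float acx = anchor.x1 + aw / 2.f;
  const float acy = anchor.y1 + ah / 2.f;
  const float cx = d[0] * aw + acx;
  const float cy = d[1] * ah + acy;
  const float w  = std::exp(d[2]) * aw;  // unclamped, as in the new path
  const float h  = std::exp(d[3]) * ah;
  return {cx - w / 2.f, cy - h / 2.f, cx + w / 2.f, cy + h / 2.f};
}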
scores_sel.mutable_data({keep_nms.numel(), 1}); + // proposals.mutable_data({keep_nms.numel(), 4});//original + // scores_sel.mutable_data({keep_nms.numel(), 1});//original + proposals.mutable_data({post_nms_top_n, 4}); // wong + scores_sel.mutable_data({post_nms_top_n, 1}); // wong + CPUGather(bbox_sel, keep_nms, &proposals); + CPUGather(scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } template <> void ProposalKernel::Compute(const ProposalParam ¶m) { + auto input_score = param.scores_; + auto input_score_data = input_score->data(); + auto input_score_data_tmp = input_score->data(); + uint32_t score_n, score_height, score_width, score_channels; + + auto input_bbox = param.bbox_deltas_; + auto input_bbox_data = input_bbox->data(); + auto input_bbox_data_tmp = input_bbox->data(); + uint32_t bbox_n, bbox_height, bbox_width, bbox_channels; + + score_n = (uint32_t)(input_score->dims()[0]); + score_channels = (uint32_t)(input_score->dims()[1]); + score_height = (uint32_t)(input_score->dims()[2]); + score_width = (uint32_t)(input_score->dims()[3]); + + bbox_n = (uint32_t)(input_bbox->dims()[0]); + bbox_channels = (uint32_t)(input_bbox->dims()[1]); + bbox_height = (uint32_t)(input_bbox->dims()[2]); + bbox_width = (uint32_t)(input_bbox->dims()[3]); + + // score_tmp->init(typeid(half)); + std::shared_ptr score_tmp = std::make_shared(); + score_tmp->Resize(param.scores_->dims()); + score_tmp->mutable_data(); + + std::shared_ptr bbox_tmp = std::make_shared(); + bbox_tmp->Resize(param.bbox_deltas_->dims()); + bbox_tmp->mutable_data(); + + auto score_tmp_data = score_tmp->data(); + auto bbox_tmp_data = bbox_tmp->data(); + int64_t amount_per_side = score_width * score_height; + int idx = 0; + fpga::fpga_invalidate( + input_score_data_tmp, + score_height * score_width * score_channels * sizeof(half)); + for (int h = 0; h < score_height; h++) { + for (int w = 0; w < score_width; w++) { + for (int c = 0; c < score_channels; c++) { + idx++; + // DLOG << "wong input_score: "<< + // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); + *(score_tmp_data + c * amount_per_side + score_width * h + w) = + (*(input_score_data_tmp++)); + } + } + } + amount_per_side = bbox_width * bbox_height; + fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width * + bbox_channels * sizeof(half)); + for (int h = 0; h < bbox_height; h++) { + for (int w = 0; w < bbox_width; w++) { + for (int c = 0; c < bbox_channels; c++) { + idx++; + // DLOG << "wong input_score: "<< + // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); + *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = + (*(input_bbox_data_tmp++)); + } + } + } + struct paddle_mobile::fpga::BypassArgs temp_score_arg; + struct paddle_mobile::fpga::BypassArgs temp_bbox_arg; + temp_score_arg = param.score_arg; + temp_score_arg.image.address = score_tmp->data(); + + temp_bbox_arg = param.bbox_arg; + temp_bbox_arg.image.address = bbox_tmp->data(); auto score_tensor = param.float_score.get(); fpga::PerformBypass(param.score_arg); fpga::fpga_invalidate(score_tensor->data(), @@ -380,9 +502,13 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { auto *rpn_rois = param.rpn_rois_; auto *rpn_roi_probs = param.rpn_probs_; + auto score_index = *(param.score_index_.get()); + int pre_nms_top_n = param.pre_nms_topn_; int post_nms_top_n = param.post_nms_topn_; - float nms_thresh = param.nms_thresh_; + // DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_; + + float nms_thresh = param.nms_thresh_ / 2.0f; float 
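ProposalForOneImage above no longer fills the index array in place; it copies the precomputed score_index_, sorts indices by descending score (the compare lambda), and gathers rows with CPUGather. A standalone sketch of that sort-then-gather step, with the gather reduced to a row memcpy:

#include <algorithm>
#include <cstring>
#include <numeric>
#include <vector>

// Sort row indices by descending score, keep at most top_n of them, then
// gather the matching 4-float rows (e.g. bbox deltas) into `out`.
void topk_by_score(const float* scores, const float* rows, int n, int top_n,
                   std::vector<float>* out) {
  std::vector<int> index(n);
  std::iota(index.begin(), index.end(), 0);  // same role as score_index_
  std::sort(index.begin(), index.end(),
            [scores](int i, int j) { return scores[i] > scores[j]; });
  const int keep = std::min(n, top_n);
  out->resize(keep * 4);
  for (int i = 0; i < keep; ++i)
    std::memcpy(out->data() + i * 4, rows + index[i] * 4, 4 * sizeof(float));
}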
min_size = param.min_size_; float eta = param.eta_; @@ -398,28 +524,28 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { int64_t w_bbox = bbox_dim[3]; // - Tensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}); - scores_swap.mutable_data({num, h_score, w_score, c_score}); + rpn_rois->mutable_data({bbox_deltas->numel(), 4}); + rpn_roi_probs->mutable_data({scores->numel(), 1}); framework::LoD lod; lod.resize(1); auto &lod0 = lod[0]; lod0.push_back(0); - anchors.Resize({anchors.numel() / 4, 4}); + anchors.Resize({anchors.numel(), 4}); + variances.Resize({variances.numel(), 4}); int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1); + Tensor scores_slice = (*score_tensor).Slice(i, i + 1); - bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4}); scores_slice.Resize({h_score * w_score * c_score, 1}); std::pair tensor_pair = ProposalForOneImage( im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, - pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); + score_index, pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); Tensor &proposals = tensor_pair.first; Tensor &scores = tensor_pair.second; diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp index 97e820e83c434dc4d552a7b0e83329fc5f6d6888..170d245c0212c06b8a25243a79c4f1bd25d314c4 100644 --- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -15,9 +15,12 @@ limitations under the License. 
*/ #ifdef PSROI_POOL_OP #include +#include #include #include "operators/kernel/detection_kernel.h" +#include "fpga/V1/api.h" +#include "fpga/V1/image.h" namespace paddle_mobile { namespace operators { @@ -29,8 +32,7 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { param->float_input = std::make_shared(); param->float_input->mutable_data(param->input_x_->dims()); - param->float_output = std::make_shared(); - param->float_output->mutable_data(param->output_->dims()); + // param->float_output = std::make_shared(); auto input = param->input_x_; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -46,22 +48,108 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { args.output.scale_address = param->float_input->scale; param->input_arg = args; - fpga::format_fp16_ofm(param->output_); + auto* rois = param->input_rois_; + int rois_num = rois->dims()[0]; + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, param->output_->dims()[1], param->output_->dims()[2], + param->output_->dims()[3]}); + param->output_->Resize(dims_out_new); + // fpga::format_fp16_ofm(param->output_); - input = param->float_output.get(); - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->output_->mutable_data(); - args.output.scale_address = param->output_->scale; - param->input_arg = args; + param->output_->mutable_data(dims_out_new); + // auto output = param->float_output.get(); + // param->output_ = output; + /* args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = output->data(); + args.image.height = (uint32_t)output->dims()[2]; + args.image.width = (uint32_t)output->dims()[3]; + args.image.channels = (uint32_t)output->dims()[1] ; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->output_arg = args;*/ return true; } +template +void PSROIPooling(const Dtype* bottom_data, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const Dtype* bottom_rois, + const int output_dim, const int group_size, Dtype* top_data, + int index, int nid, const Dtype Bin_size_h, + const Dtype Bin_size_w, const Dtype roi_start_h, + const Dtype roi_start_w, const int ctop, const int ph, + const int roi_batch_ind) { + int pw = index; + int hstart = floor(static_cast(ph) * Bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * Bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * Bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * Bin_size_w + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int c = (ctop * group_size + ph) * group_size + pw; + + Dtype bin_area = (hend - hstart) * (wend - wstart); + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + out_sum += bottom_data[bottom_index]; + } + } + + top_data[nid + index] = is_empty ? 0. 
: out_sum / bin_area; +} + +void convert_to_chw(float** data_in, int channel, int height, int width, + int num) { + float* data_in_tmp = *data_in; + float* data_tmp = reinterpret_cast( + fpga::fpga_malloc(channel * height * width * sizeof(float))); // NOLINT + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * height * width * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } + } + *data_in = data_tmp; + fpga::fpga_free(data_in_tmp); +} + +void convert_to_hwc(float** data_in, int channel, int height, int width, + int num) { + float* data_in_tmp = *data_in; + float* data_tmp = reinterpret_cast( + fpga::fpga_malloc(num * channel * height * width * sizeof(float))); + int64_t amount_per_row = width * channel; + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * channel * height * width + offset_height + + w * channel + c) = *((*data_in)++); + } + } + } + } + *data_in = data_tmp; + fpga::fpga_free(data_in_tmp); +} + template <> void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto input_tensor = param.float_input.get(); @@ -71,7 +159,7 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto* in = input_tensor; auto* rois = param.input_rois_; - auto* out = param.float_output.get(); + auto* out = param.output_; // param.float_output.get(); auto pooled_height = param.pooled_height_; auto pooled_width = param.pooled_width_; @@ -85,18 +173,18 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { int width = in_dims[3]; int rois_num = rois->dims()[0]; - // TODO auto in_stride = framework::stride(in_dims); - // TODO auto out_stride = framework::stride(out->dims()); - auto in_stride = - framework::stride({batch_size, height, width, input_channels}); - auto out_stride = framework::stride( - {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + auto data_nhwc = in->mutable_data(); + fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width, 1); + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), + (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); - const float* input_data = in->data(); + float* input_data = data_nhwc; // in->data(); + // shared_ptr input_data(data_nhwc); framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); auto rois_batch_id_data = rois_batch_id_list.mutable_data(); - return; PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); @@ -115,19 +203,16 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { "output_channels x pooled_height x pooled_width"); // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } + // for (int n = 0; n < rois_batch_size; ++n) { + // for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + // rois_batch_id_data[i] = n; + // } + //} auto output_data = out->mutable_data(); auto input_rois = rois->data(); // calculate psroipooling, parallel processing can be implemented per ROI for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; 
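PSROIPooling above computes, for each output cell, a floor/ceil bin window on the input map, clamps it to the image, and averages the covered positions; the channel actually read is (ctop * group_size + ph) * group_size + pw. A condensed sketch of the per-bin average, assuming the pointer is already offset to the selected channel plane (CHW layout):

#include <algorithm>
#include <cmath>

// Average one pooling bin over a single height x width plane.
float psroi_bin_avg(const float* plane, int height, int width,
                    float bin_h, float bin_w,
                    float roi_start_h, float roi_start_w, int ph, int pw) {
  int hstart = static_cast<int>(std::floor(ph * bin_h + roi_start_h));
  int wstart = static_cast<int>(std::floor(pw * bin_w + roi_start_w));
  int hend   = static_cast<int>(std::ceil((ph + 1) * bin_h + roi_start_h));
  int wend   = static_cast<int>(std::ceil((pw + 1) * bin_w + roi_start_w));
  hstart = std::min(std::max(hstart, 0), height);
  hend   = std::min(std::max(hend, 0), height);
  wstart = std::min(std::max(wstart, 0), width);
  wend   = std::min(std::max(wend, 0), width);
  if (hend <= hstart || wend <= wstart) return 0.f;  // empty bin
  float sum = 0.f;
  for (int h = hstart; h < hend; ++h)
    for (int w = wstart; w < wend; ++w) sum += plane[h * width + w];
  return sum / ((hend - hstart) * (wend - wstart));
}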
- // [start, end) interval for spatial sampling auto offset_input_rois = input_rois + n * 4; auto roi_start_w = @@ -146,56 +231,28 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { // Compute bin size w and h at input feature map auto bin_size_h = roi_height / static_cast(pooled_height); auto bin_size_w = roi_width / static_cast(pooled_width); - DLOG << 3; - // calculate each pixel of the output feature map. - int out_roi_offset = n * out_stride[0]; + int roi_batch_ind = 0; // rois_batch_id_data[n]; + // std::cout << "roi_batch_ind: " << roi_batch_ind << std::endl; for (int c = 0; c < output_channels; ++c) { - // per category - // int out_plane_offset = out_roi_offset + c * out_stride[1]; - int out_plane_offset = out_roi_offset + c; - for (int ph = 0; ph < pooled_height; ++ph) { - // TODO int out_row_offset = out_plane_offset + ph * - // out_stride[2]; - int out_row_offset = out_plane_offset + ph * out_stride[1]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = - ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = - ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - // TODO int output_index = out_row_offset + pw; - int output_index = out_row_offset + pw * output_channels; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - // TODO int input_plane_offset = - // TODO roi_batch_id * in_stride[0] + input_channel * - // in_stride[1]; - int input_plane_offset = roi_batch_id * in_stride[0] + input_channel; - auto offset_input_data = input_data + input_plane_offset; - float out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[1] + iw * input_channel; - out_sum += offset_input_data[input_index]; - } - } - float bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + for (int ph = 0; ph < pooled_height; ph++) { + int index = pooled_width; + int nid = n * output_channels * pooled_height * pooled_width + + c * pooled_width * pooled_height + ph * pooled_width; + for (int idx = 0; idx < index; idx++) { + PSROIPooling(input_data, input_channels, height, width, + pooled_height, pooled_width, input_rois, + output_channels, pooled_height, output_data, idx, + nid, bin_size_h, bin_size_w, roi_start_h, + roi_start_w, c, ph, roi_batch_ind); } } } } - fpga::format_image(out); - fpga::PerformBypass(param.output_arg); + fpga::fpga_free(input_data); + fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height, + pooled_width, rois_num); + out->reset_data_ptr(output_data); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/relu_kernel.cpp b/src/operators/kernel/fpga/V1/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fff10f6206212379e865f2041e3d35cca955bfd --- /dev/null +++ b/src/operators/kernel/fpga/V1/relu_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
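convert_to_chw and convert_to_hwc above reorder the float buffer between the FPGA's HWC layout and the CHW layout the CPU loops expect, allocating a temporary with fpga_malloc and freeing the original. A simplified single-image version of the HWC-to-CHW direction, using plain new/delete instead of the FPGA allocator:

// Reorder one image from HWC to CHW into a freshly allocated buffer.
// The caller owns the returned pointer (delete[] when done).
float* hwc_to_chw(const float* src, int channel, int height, int width) {
  float* dst = new float[channel * height * width];
  const int plane = height * width;
  for (int h = 0; h < height; ++h)
    for (int w = 0; w < width; ++w)
      for (int c = 0; c < channel; ++c)
        dst[c * plane + h * width + w] = src[(h * width + w) * channel + c];
  return dst;
}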
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RELU_OP + +#include "operators/kernel/activation_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReluKernel::Init(ReluParam *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam ¶m) {} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp index 9e5ce02658adb5fe94935b8d7f4d412405a0727e..647ecb5a6501371c74c8762cf81cee206f1dca68 100644 --- a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp @@ -47,21 +47,11 @@ bool Reshape2Kernel::Init(Reshape2Param *param) { void reshape(LoDTensor *input, LoDTensor *output) { // Subscript r means after reshape - // TODO zhangyang verify this function - float *input_ptr_f, *output_ptr_f; - half *input_ptr_h, *output_ptr_h; - bool is_float = false; - - if (input->type() == typeid(float)) { - input_ptr_f = input->data(); - output_ptr_f = output->data(); - is_float = true; - - } else { - input_ptr_h = input->data(); - output_ptr_h = output->data(); - } + auto input_ptr = input->data(); + auto output_ptr = output->data(); + output->scale[0] = input->scale[0]; + output->scale[1] = input->scale[1]; auto C = static_cast(input->dims()[1]); auto H = static_cast(input->dims()[2]); @@ -77,6 +67,8 @@ void reshape(LoDTensor *input, LoDTensor *output) { auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); auto HWr = Hr * Wr; + fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half)); + int offset_align = 0; int offset_r = 0, offset_align_r = 0; int cr = 0, hr = 0, wr = 0; @@ -87,21 +79,17 @@ void reshape(LoDTensor *input, LoDTensor *output) { int offset1 = w * C + offset0; for (int c = 0; c < C; c++) { offset_align = offset1 + c; - offset_r = c * HW + h * W + c; + offset_r = c * HW + h * W + w; cr = offset_r / HWr; hr = offset_r % HWr / Wr; wr = offset_r % Wr; offset_align_r = hr * WCr_align + wr * Cr + cr; - // DLOG << "hwc"<< h<< " " << w << " " << c; - // DLOG << "hrwrcr" << hr<< " " << wr << " " << cr; - if (is_float) { - output_ptr_f[offset_align_r] = input_ptr_f[offset_align]; - } else { - output_ptr_h[offset_align_r] = input_ptr_h[offset_align]; - } + output_ptr[offset_align_r] = input_ptr[offset_align]; } } } + + fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half)); } template <> @@ -123,6 +111,9 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { output->Resize(framework::make_ddim(shape)); if (output->dims() == input->dims()) { DLOG << "No need to reshape"; + output->ShareDataWith(*input); + framework::LoD lod = input->lod(); + output->set_lod(lod); return; } diff --git a/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ec8d19db800742693516e08215ccd3889ec86c37 --- /dev/null +++ 
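The rewritten reshape routine above walks the source in HWC order, forms each element's logical CHW offset (the corrected c*HW + h*W + w), and re-expands that offset into destination (cr, hr, wr) coordinates before applying the alignment-padded HWC stride. The index arithmetic in isolation, as a sketch:

// Map a logical CHW offset into destination coordinates and the aligned
// HWC offset used by the FPGA layout, where WCr_align = align_to_x(Wr * Cr, 16)
// is assumed to be precomputed by the caller.
struct DstIndex { int cr, hr, wr, aligned_offset; };

DstIndex remap_chw_offset(int offset_r, int Cr, int Hr, int Wr,
                          int WCr_align) {
  const int HWr = Hr * Wr;
  DstIndex d;
  d.cr = offset_r / HWr;
  d.hr = offset_r % HWr / Wr;
  d.wr = offset_r % Wr;
  d.aligned_offset = d.hr * WCr_align + d.wr * Cr + d.cr;
  return d;
}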
b/src/operators/kernel/fpga/V1/roialign_pool_kernel.cpp @@ -0,0 +1,296 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ROIALIGN_POOL_OP + +#include +#include +#include "operators/kernel/detection_kernel.h" + +#include "fpga/V1/api.h" +#include "fpga/V1/image.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool RoiAlignPoolKernel::Init(RoiAlignPoolParam* param) { + auto dims = param->input_x_->dims(); + PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, + "data not aligned"); + + param->float_input = std::make_shared(); + param->float_input->mutable_data(param->input_x_->dims()); + + auto input = param->input_x_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_input->mutable_data(); + args.output.scale_address = param->float_input->scale; + param->input_arg = args; + + auto* rois = param->input_rois_; + int rois_num = rois->dims()[0]; + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, param->output_->dims()[1], param->output_->dims()[2], + param->output_->dims()[3]}); + param->output_->Resize(dims_out_new); + + param->output_->mutable_data(dims_out_new); + + return true; +} + +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, const int width, const int pooled_height, + const int pooled_width, const int iy_upper, const int ix_upper, + T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, + int roi_bin_grid_h, int roi_bin_grid_w, + std::vector>& pre_calc) { // NOLINT + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + + if (y_low >= height - 1) { + 
y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indeces + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward(const int nthreads, const T* bottom_data, + const T& spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int sampling_ratio, + const T* bottom_rois, T* top_data) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + // roi could have 4 or 5 columns + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = 0; + // if (roi_cols == 5) { + // roi_batch_ind = offset_bottom_rois[0]; + // offset_bottom_rois++; + // } + + // Do not using rounding; this implementation detail is critical + T roi_start_w = offset_bottom_rois[0] * spatial_scale; + T roi_start_h = offset_bottom_rois[1] * spatial_scale; + T roi_end_w = offset_bottom_rois[2] * spatial_scale; + T roi_end_h = offset_bottom_rois[3] * spatial_scale; + // T roi_start_w = round(offset_bottom_rois[0] * spatial_scale); + // T roi_start_h = round(offset_bottom_rois[1] * spatial_scale); + // T roi_end_w = round(offset_bottom_rois[2] * spatial_scale); + // T roi_end_h = round(offset_bottom_rois[3] * spatial_scale); + + // Force malformed ROIs to be 1x1 + T roi_width = std::max(roi_end_w - roi_start_w, (T)1.); + T roi_height = std::max(roi_end_h - roi_start_h, (T)1.); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + // we want to precalculate indeces and weights shared by all chanels, + // this is the key point of optimiation + std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * + pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, width, pooled_height, pooled_width, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, + roi_bin_grid_h, roi_bin_grid_w, pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_bottom_data[pc.pos1] + + pc.w2 * offset_bottom_data[pc.pos2] + + pc.w3 * offset_bottom_data[pc.pos3] + + pc.w4 * offset_bottom_data[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + top_data[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template <> +void RoiAlignPoolKernel::Compute( + const RoiAlignPoolParam& param) { + auto input_tensor = param.float_input.get(); + fpga::PerformBypass(param.input_arg); + fpga::fpga_invalidate(input_tensor->data(), + input_tensor->numel() * sizeof(float)); + + auto* in = input_tensor; + auto* rois = param.input_rois_; + auto* out = param.output_; // param.float_output.get(); + + auto pooled_height = param.pooled_height_; + auto pooled_width = param.pooled_width_; + auto spatial_scale = param.spatial_scale_; + auto sampe_ratio = param.sampling_ratio_; + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto data_nhwc = in->mutable_data(); + + fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), + (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); + + const int index = input_channels * pooled_height * pooled_width * rois_num; + auto rois_data = rois->data(); + auto top_data = param.output_->mutable_data(); + for (int i = 0; i < index; ++i) { + ROIAlignForward(index, data_nhwc, spatial_scale, input_channels, + height, width, pooled_height, pooled_width, + sampe_ratio, rois_data, top_data); + } + + fpga::image::convert_to_hwc(&top_data, input_channels, pooled_height, + pooled_width, rois_num); + out->reset_data_ptr(top_data); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif // ROIALIGN_POOL_OP diff --git a/src/operators/kernel/fpga/V1/slice_kernel.cpp b/src/operators/kernel/fpga/V1/slice_kernel.cpp index 5d0ac1fe61caa9cce0e1af6f8ac5c53b315573db..39e5c64b34c2a6b0629a7f2ab07a8683e9c45edd 100644 --- a/src/operators/kernel/fpga/V1/slice_kernel.cpp +++ b/src/operators/kernel/fpga/V1/slice_kernel.cpp @@ -33,13 +33,18 @@ bool SliceKernel::Init(SliceParam* param) { template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension + // Only support half data + // W must be aligned to 16 auto input = param.input_; - DLOG << input; + auto output = param.output_; int HW = input->dims()[2] * 
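The precalc table above stores, for every sampling point, four neighbour offsets plus bilinear weights (w1..w4 = hy*hx, hy*lx, ly*hx, ly*lx); ROIAlignForward then accumulates the four weighted reads and divides by the sample count. One sampling point in isolation, as a hedged sketch of the same interpolation:

#include <algorithm>

// Bilinearly sample location (y, x) from a height x width plane.
// Points outside [-1, height] x [-1, width] contribute 0, as above.
float bilinear_sample(const float* plane, int height, int width,
                      float y, float x) {
  if (y < -1.f || y > height || x < -1.f || x > width) return 0.f;
  y = std::max(y, 0.f);
  x = std::max(x, 0.f);
  int y_low = static_cast<int>(y), x_low = static_cast<int>(x);
  const int y_high = std::min(y_low + 1, height - 1);
  const int x_high = std::min(x_low + 1, width - 1);
  y_low = std::min(y_low, height - 1);
  x_low = std::min(x_low, width - 1);
  const float ly = y - y_low, lx = x - x_low;
  const float hy = 1.f - ly, hx = 1.f - lx;
  return hy * hx * plane[y_low * width + x_low] +
         hy * lx * plane[y_low * width + x_high] +
         ly * hx * plane[y_high * width + x_low] +
         ly * lx * plane[y_high * width + x_high];
}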
input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); - auto output_ptr = param.output_->data(); + auto output_ptr = output->data(); + + output->scale[0] = input->scale[0]; + output->scale[1] = input->scale[1]; int start = param.starts_[0], end = param.ends_[0]; start = start < 0 ? start + channel : start; @@ -47,9 +52,10 @@ void SliceKernel::Compute(const SliceParam& param) { start = start > channel ? channel : start; end = end > channel ? channel : end; int len = end - start; + size_t size = len * sizeof(half); for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, len); + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 683c5953b3c90bb387dce14b7941764272906ceb..5537565bc2a4dc7563148617daf47eaa9a50ba91 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -23,14 +23,21 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto dims = framework::vectorize(input->dims()); + half *input_ptr; auto out = param->Out(); + if (input->type() == typeid(float)) { + out->Resize(framework::make_ddim(dims)); + out->mutable_data(framework::make_ddim(dims)); + } else { + input_ptr = input->data(); + } auto float_input = new LoDTensor; PADDLE_MOBILE_ENFORCE(input->dims().size() == 4, "Softmax should have 4-order input"); - auto dims = framework::vectorize(input->dims()); + auto channel = dims[3]; if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1] PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op"); @@ -41,9 +48,12 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { float_input->Resize(framework::make_ddim(dims)); if (channel != 2) { // Use CPU + out->Resize(framework::make_ddim(dims)); + out->mutable_data(framework::make_ddim(dims)); float_input->init(typeid(float)); - fpga::format_fp32_ofm(float_input); - fpga::format_fp32_ofm(out); + float_input->mutable_data(framework::make_ddim(dims)); + // fpga::format_fp32_ofm(float_input); + // fpga::format_fp32_ofm(out); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; args.input_layout_type = fpga::LAYOUT_HWC; @@ -51,7 +61,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { args.input_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP32; args.image.address = input_ptr; - args.image.height = (uint32_t)dims[1]; + args.image.height = (uint32_t)dims[1] * dims[0]; args.image.width = (uint32_t)dims[2]; args.image.channels = (uint32_t)dims[3]; args.output.address = float_input->data(); @@ -80,14 +90,25 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - fpga::PerformBypass(param.FpgaArgs()); - - if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { - Tensor *out = param.Out(); - Tensor *in_x = param.FloatInput(); - fpga::fpga_invalidate(in_x->data(), in_x->numel() * sizeof(float)); - math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); + auto *in_x = (param.InputX()); + if (in_x->type() == typeid(half)) { + fpga::PerformBypass(param.FpgaArgs()); + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + Tensor *in_x2 = 
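SliceKernel::Compute above slices along the channel dimension of an HWC fp16 buffer: for each of the HW pixels it copies `len` consecutive channel values, and the fix makes the memcpy size len * sizeof(half) bytes rather than len. The same copy as a standalone helper:

#include <cstdint>
#include <cstring>

using half_t = uint16_t;  // fp16 storage type assumed here

// Slice channels [start, end) out of an HWC buffer with `channel` channels
// and HW pixels; the output is densely packed with (end - start) channels.
void slice_channels_hwc(const half_t* in, half_t* out, int HW, int channel,
                        int start, int end) {
  const int len = end - start;
  const size_t bytes = len * sizeof(half_t);
  for (int i = 0; i < HW; ++i) {
    std::memcpy(out + i * len, in + i * channel + start, bytes);
  }
}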
param.FloatInput(); + + fpga::fpga_invalidate(in_x2->data(), + in_x2->numel() * sizeof(float)); + math::SoftmaxFuntor()(in_x2, out); + fpga::fpga_flush(out->data(), out->memory_size()); + } + } else { + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + out->Resize( + {in_x->dims()[0], out->dims()[1], out->dims()[2], out->dims()[3]}); + math::SoftmaxFuntor()(in_x, out); + } } } diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp index f74839f1fc06e0b5bf391187f5ecab461f7c00f5..cc839a971ee7f827f150ecdfff0bd75e2a8aafe2 100644 --- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -42,6 +42,11 @@ template <> void Transpose2Kernel::Compute( const Transpose2Param ¶m) { // Transpose2Compute(param); + auto input = param.InputX(); + auto output = param.Out(); + + output->Resize({input->dims()[0], output->dims()[1], output->dims()[2], + output->dims()[3]}); } } // namespace operators diff --git a/src/operators/kernel/logical_kernel.h b/src/operators/kernel/logical_kernel.h index 8c49669fa8f276b28e4d3b50db16937f766f70a1..b42ae27005212147a7a7467f974a8f10ca4af299 100644 --- a/src/operators/kernel/logical_kernel.h +++ b/src/operators/kernel/logical_kernel.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include + #include "framework/operator.h" #include "operators/op_param.h" diff --git a/src/operators/kernel/while_kernel.h b/src/operators/kernel/while_kernel.h index 64fb7a607e7f9d8fdbd2c6d1091b9da7133831be..149a2e9829f300b536b281f25eaee38400cac1a8 100644 --- a/src/operators/kernel/while_kernel.h +++ b/src/operators/kernel/while_kernel.h @@ -25,11 +25,14 @@ template class WhileParam : public OpParam { public: WhileParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) - : scope_(&scope) { + const AttributeMap &attrs, Scope *scope) + : inputs_(inputs), + outputs_(outputs), + scope_(*scope), + OpParam(inputs, outputs, attrs, scope) { cond_ = - OpParam::GetVarValue("Condition", inputs, scope); - sub_block_ = OpParam::GetAttr("sub_block", attrs); + OpParam::GetVarValue("Condition", inputs, *scope); + sub_block_ = OpParam::GetAttr("sub_block", attrs); } public: diff --git a/src/operators/op_param.cpp b/src/operators/op_param.cpp index 4d1689911686198612eb4df4dfe8f99450ba503d..be12ecbe73c9b3c1fe7ea6b7380288cbe42838b1 100644 --- a/src/operators/op_param.cpp +++ b/src/operators/op_param.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "op_param.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -47,6 +47,9 @@ template class ConvParam; template class ElementwiseAddParam; template class ElementwiseAddParam; template class ElementwiseAddParam; +template class ElementwiseMulParam; +template class ElementwiseMulParam; +template class ElementwiseMulParam; #ifdef MUL_OP template class MulParam; diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 6bd2470cb4ffef476d7f868c3f9ee4eceaa429bb..bc45c69d1da2bcf801b39e52582444cbc9c436dd 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -69,6 +69,30 @@ struct DtypeTensorTrait { #endif class OpParam { + public: + OpParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, Scope *scope) { + scope_pointer_ = scope; + inputs_ = inputs; + } + + template + T *CreateNewScale() { + std::string scale_key = Getkey("Scale", inputs_, 0); + auto var = scope_pointer_->Var(scale_key + "_new"); + return var->GetMutable(); + } + + template + T *CreateNewBiase() { + std::string biase_key = Getkey("Bias", inputs_, 0); + auto var = scope_pointer_->Var(biase_key + "_new"); + return var->GetMutable(); + } + + VariableNameMap inputs_; + Scope *scope_pointer_ = nullptr; + protected: template static T *InputH0From(const VariableNameMap &inputs, const Scope &scope) { @@ -359,8 +383,10 @@ class OpParam { } } - static std::string getkey(const string &key, const VariableNameMap &var_map, + static std::string Getkey(const string &key, const VariableNameMap &var_map, int index) { + PADDLE_MOBILE_ENFORCE(var_map.count(key) > index, + "%s is not contained in var_map", key.c_str()) auto var_vec = var_map.at(key); return var_vec[index]; } @@ -414,11 +440,12 @@ class ConvParam : public OpParam { public: ConvParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - filter_ = OpParam::FilterFrom(inputs, scope); - input_ = OpParam::InputFrom(inputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + filter_ = OpParam::FilterFrom(inputs, *scope); + input_ = OpParam::InputFrom(inputs, *scope); if (outputs.count("Output")) { - output_ = OpParam::OutputFrom(outputs, scope); + output_ = OpParam::OutputFrom(outputs, *scope); } strides_ = OpParam::GetAttr>("strides", attrs); paddings_ = OpParam::GetAttr>("paddings", attrs); @@ -498,17 +525,18 @@ template Print &operator<<(Print &printer, const ConvParam &conv_param); template -class ElementwiseAddParam : OpParam { +class ElementwiseAddParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: ElementwiseAddParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); axis_ = GetAttr("axis", attrs); } @@ -533,22 +561,27 @@ class ElementwiseAddParam : OpParam { public: const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; } + + public: + Tensor float_input_x, float_out; + #endif }; #ifdef ELEMENTWISEMUL_OP template -class ElementwiseMulParam : 
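The op_param.h changes above switch every param constructor from `const Scope &` to `Scope *`, make the derived params inherit publicly, and forward to a new OpParam base constructor that keeps the scope pointer so helpers such as CreateNewScale can create variables later. The shape of that pattern, reduced to a toy example with hypothetical class names:

#include <map>
#include <string>
#include <vector>

using VariableNameMap = std::map<std::string, std::vector<std::string>>;

struct Scope { /* variable storage elided for the sketch */ };

class OpParamBase {  // stands in for OpParam
 public:
  OpParamBase(const VariableNameMap &inputs, Scope *scope)
      : inputs_(inputs), scope_pointer_(scope) {}
  VariableNameMap inputs_;
  Scope *scope_pointer_ = nullptr;  // kept so params can add variables later
};

class ReluLikeParam : public OpParamBase {  // stands in for a derived param
 public:
  ReluLikeParam(const VariableNameMap &inputs, Scope *scope)
      : OpParamBase(inputs, scope) {
    // Lookups that previously took `const Scope &` now dereference the
    // pointer, e.g.: input_x_ = InputXFrom<GType>(inputs, *scope);
  }
};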
OpParam { +class ElementwiseMulParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: ElementwiseMulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); axis_ = GetAttr("axis", attrs); } @@ -565,6 +598,12 @@ class ElementwiseMulParam : OpParam { GType *input_y_; GType *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + public: + Tensor float_input_x, float_out; + +#endif }; #endif @@ -575,17 +614,18 @@ using ElementwiseAddReluParam = ElementwiseAddParam; #ifdef ELEMENTWISESUB_OP template -class ElementwiseSubParam : OpParam { +class ElementwiseSubParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: ElementwiseSubParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); axis_ = GetAttr("axis", attrs); } @@ -607,16 +647,17 @@ class ElementwiseSubParam : OpParam { #ifdef MUL_OP template -class MulParam : OpParam { +class MulParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: MulParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); } @@ -648,9 +689,10 @@ class ConcatParam : public OpParam { public: ConcatParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - inputs_ = InputMultiFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + inputs_ = InputMultiFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); axis_ = GetAttr("axis", attrs); } @@ -684,11 +726,12 @@ class SumParam : public OpParam { public: SumParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - inputs_vars_ = InputMultiVarsFrom(inputs, scope); - out_var_ = OutVarFrom(outputs, scope); - inputs_ = InputMultiFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + inputs_vars_ = InputMultiVarsFrom(inputs, *scope); + out_var_ = OutVarFrom(outputs, *scope); + inputs_ = InputMultiFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); } vector InputsVars() const { return inputs_vars_; } @@ -715,10 +758,11 @@ class LrnParam : 
public OpParam { public: LrnParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - mid_out_ = MidOutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + mid_out_ = MidOutFrom(outputs, *scope); n_ = GetAttr("n", attrs); alpha_ = GetAttr("alpha", attrs); beta_ = GetAttr("beta", attrs); @@ -756,16 +800,17 @@ class LrnParam : public OpParam { #ifdef NORM_OP template -class NormParam : OpParam { +class NormParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: NormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - output_norm_ = OutputNormFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + output_norm_ = OutputNormFrom(outputs, *scope); epsilon_ = GetAttr("epsilon", attrs); axis_ = GetAttr("axis", attrs); } @@ -791,19 +836,20 @@ class NormParam : OpParam { #ifdef BATCHNORM_OP template -class BatchNormParam : OpParam { +class BatchNormParam : public OpParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: BatchNormParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_y_ = OutputYFrom(outputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_mean_ = InputMeanFrom(inputs, scope); - input_scale_ = InputScaleFrom(inputs, scope); - input_variance_ = InputVarianceFrom(inputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + output_y_ = OutputYFrom(outputs, *scope); + input_bias_ = InputBiasFrom(inputs, *scope); + input_mean_ = InputMeanFrom(inputs, *scope); + input_scale_ = InputScaleFrom(inputs, *scope); + input_variance_ = InputVarianceFrom(inputs, *scope); epsilon_ = GetAttr("epsilon", attrs); momentum_ = GetAttr("momentum", attrs); // is_test_ = GetAttr("is_test", attrs); @@ -861,10 +907,11 @@ class PoolParam : public OpParam { public: PoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputXFrom(inputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputXFrom(inputs, *scope); - output_ = OutFrom(outputs, scope); + output_ = OutFrom(outputs, *scope); pooling_type_ = GetStringAttr("pooling_type", attrs); ksize_ = GetAttr>("ksize", attrs); strides_ = GetAttr>("strides", attrs); @@ -918,11 +965,12 @@ class PriorBoxParam : public OpParam { public: PriorBoxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputFrom(inputs, scope); - input_image_ = InputImageFrom(inputs, scope); - output_boxes_ = OutputBoxesFrom(outputs, scope); - output_variances_ = OutputVariancesFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = 
InputFrom(inputs, *scope); + input_image_ = InputImageFrom(inputs, *scope); + output_boxes_ = OutputBoxesFrom(outputs, *scope); + output_variances_ = OutputVariancesFrom(outputs, *scope); min_sizes_ = GetAttr>("min_sizes", attrs); max_sizes_ = GetAttr>("max_sizes", attrs); aspect_ratios_ = GetAttr>("aspect_ratios", attrs); @@ -996,11 +1044,12 @@ class BoxCoderParam : public OpParam { public: BoxCoderParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_priorbox_ = InputPriorBoxFrom(inputs, scope); - input_priorboxvar_ = InputPriorBoxVarFrom(inputs, scope); - input_targetbox_ = InputTargetBoxFrom(inputs, scope); - output_box_ = OutputBoxFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_priorbox_ = InputPriorBoxFrom(inputs, *scope); + input_priorboxvar_ = InputPriorBoxVarFrom(inputs, *scope); + input_targetbox_ = InputTargetBoxFrom(inputs, *scope); + output_box_ = OutputBoxFrom(outputs, *scope); code_type_ = GetStringAttr("code_type", attrs); } const GType *InputPriorBox() const { return input_priorbox_; } @@ -1030,9 +1079,10 @@ class SoftmaxParam : public OpParam { public: SoftmaxParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } GType *Out() const { return out_; } @@ -1066,9 +1116,10 @@ class SigmoidParam : public OpParam { public: SigmoidParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } GType *Out() const { return out_; } @@ -1097,10 +1148,11 @@ class MultiClassNMSParam : public OpParam { public: MultiClassNMSParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_bboxes_ = InputBBoxesFrom(inputs, scope); - input_scores_ = InputScoresFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_bboxes_ = InputBBoxesFrom(inputs, *scope); + input_scores_ = InputScoresFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); background_label_ = GetAttr("background_label", attrs); nms_top_k_ = GetAttr("nms_top_k", attrs); keep_top_k_ = GetAttr("keep_top_k", attrs); @@ -1149,9 +1201,10 @@ class PolygonBoxTransformParam : public OpParam { public: PolygonBoxTransformParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputFrom(inputs, scope); - output_ = OutputFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputFrom(inputs, *scope); + output_ = OutputFrom(outputs, *scope); } const GType *Input() const { return input_; } GType *Output() const { return output_; } @@ -1169,7 +1222,8 @@ class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const 
VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + const AttributeMap &attrs, const Scope &scope) + : OpParam(inputs, outputs, attrs, scope) { input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); col_ = GetAttr("col", attrs); @@ -1195,7 +1249,8 @@ class FetchParam : public OpParam { public: FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + const AttributeMap &attrs, const Scope &scope) + : OpParam(inputs, outputs, attrs, scope) { input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); col_ = GetAttr("col", attrs); @@ -1210,6 +1265,7 @@ class FetchParam : public OpParam { framework::LoDTensorArray *out_; int col_; #ifdef PADDLE_MOBILE_FPGA + public: fpga::BypassArgs fpga_bypass_args; #endif @@ -1224,9 +1280,10 @@ class FillConstantParam : public OpParam { public: FillConstantParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - out_var_ = OutVarFrom(outputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + out_var_ = OutVarFrom(outputs, *scope); + out_ = OutFrom(outputs, *scope); dtype_ = GetAttr("dtype", attrs); shape_ = GetAttr>("shape", attrs); value_ = GetAttr("value", attrs); @@ -1259,9 +1316,10 @@ class TransposeParam : public OpParam { public: TransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); axis_ = GetAttr>("axis", attrs); } @@ -1286,10 +1344,11 @@ class Transpose2Param : public OpParam { public: Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - output_xshape_ = OutputXShapeFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + output_xshape_ = OutputXShapeFrom(outputs, *scope); axis_ = GetAttr>("axis", attrs); } @@ -1317,10 +1376,11 @@ class LookupParam : public OpParam { public: LookupParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_w_ = InputWFrom(inputs, scope); - input_ids_ = InputIdsFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_w_ = InputWFrom(inputs, *scope); + input_ids_ = InputIdsFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); padding_idx_ = GetAttr("padding_idx", attrs); } @@ -1347,12 +1407,13 @@ class CrfParam : public OpParam { // {G_OP_TYPE_CRF, {{"Emission", "Transition", "Label"}, {"ViterbiPath"}}}, CrfParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { // todo crf params - input_emission_ = InputEmissionFrom(inputs, scope); - input_transition_ = InputTransitionFrom(inputs, scope); - input_label_ = InputLabelFrom(inputs, scope); - output_viterbipath_ = 
OutputViterbiPathFrom(outputs, scope); + input_emission_ = InputEmissionFrom(inputs, *scope); + input_transition_ = InputTransitionFrom(inputs, *scope); + input_label_ = InputLabelFrom(inputs, *scope); + output_viterbipath_ = OutputViterbiPathFrom(outputs, *scope); // padding_idx_ = GetAttr("padding_idx", attrs); } const GType *InputEmission() const { return input_emission_; } @@ -1383,10 +1444,11 @@ class ReshapeParam : public OpParam { public: ReshapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_shape_ = InputShapeFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_shape_ = InputShapeFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); shape_ = GetAttr>("shape", attrs); if (HasAttr("inplace", attrs)) { @@ -1424,11 +1486,12 @@ class Reshape2Param : public OpParam { public: Reshape2Param(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_shape_ = InputShapeFrom(inputs, scope); - out_ = OutFrom(outputs, scope); - output_xshape_ = OutputXShapeFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_shape_ = InputShapeFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); + output_xshape_ = OutputXShapeFrom(outputs, *scope); shape_ = GetAttr>("shape", attrs); if (HasAttr("inplace", attrs)) { inplace_ = GetAttr("inplace", attrs); @@ -1467,10 +1530,11 @@ class ScaleParam : public OpParam { public: ScaleParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_bias_ = InputBiasFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); inplace_ = GetAttr("inplace", attrs); has_bias_ = GetAttr("has_bias", attrs); scales_ = GetAttr>("scales", attrs); @@ -1510,9 +1574,10 @@ class SliceParam : public OpParam { public: SliceParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); axes_ = GetAttr>("axes", attrs); starts_ = GetAttr>("starts", attrs); @@ -1536,10 +1601,11 @@ class ResizeParam : public OpParam { public: ResizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_shape_ = InputShapeFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_shape_ = InputShapeFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); is_pyramid_test_ = GetAttr("is_pyramid_test", attrs); height_ = GetAttr("height", attrs); width_ = GetAttr("width", attrs); @@ -1586,9 +1652,10 @@ class 
ReluParamBase : public OpParam { public: ReluParamBase(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } @@ -1628,9 +1695,10 @@ class TanhParam : public OpParam { public: TanhParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } GType *Out() const { return out_; } @@ -1663,12 +1731,13 @@ class PReluParam : public OpParam { public: PReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { DLOG << "PReluParam inputs before"; - input_x_ = InputXFrom(inputs, scope); - alpha_ = InputAlphaFrom(inputs, scope); + input_x_ = InputXFrom(inputs, *scope); + alpha_ = InputAlphaFrom(inputs, *scope); framework::DDim dims = alpha_->dims(); - out_ = OutFrom(outputs, scope); + out_ = OutFrom(outputs, *scope); mode_ = GetStringAttr("mode", attrs); DLOG << "PReluParam mode after" << mode_; } @@ -1692,11 +1761,12 @@ class FusionFcParam : public OpParam { public: FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - input_z_ = InputZFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + input_z_ = InputZFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); x_num_col_dims_ = GetAttr("x_num_col_dims", attrs); y_num_col_dims_ = GetAttr("y_num_col_dims", attrs); axis_ = GetAttr("axis", attrs); @@ -1747,9 +1817,9 @@ class FusionConvAddParam : public ConvParam { public: FusionConvAddParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) + Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); this->output_ = OpParam::OutFrom(outputs, scope); } @@ -1771,7 +1841,7 @@ class FusionConvAddReluParam : public FusionConvAddParam { public: FusionConvAddReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : FusionConvAddParam(inputs, outputs, attrs, scope) {} }; #endif @@ -1785,12 +1855,12 @@ class FusionConvAddPReluParam : public ConvParam { public: FusionConvAddPReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { - alpha_ = OpParam::InputAlphaFrom(inputs, scope); + alpha_ = 
OpParam::InputAlphaFrom(inputs, *scope); mode_ = OpParam::GetStringAttr("mode", attrs); framework::DDim dims = alpha_->dims(); - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); this->output_ = OpParam::OutFrom(outputs, scope); } @@ -1816,21 +1886,21 @@ class FusionConvAddAddPReluParam : public ConvParam { public: FusionConvAddAddPReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { - bias1_ = OpParam::InputYFrom1(inputs, scope); - alpha_ = OpParam::InputAlphaFrom(inputs, scope); + bias1_ = OpParam::InputYFrom1(inputs, *scope); + alpha_ = OpParam::InputAlphaFrom(inputs, *scope); mode_ = OpParam::GetStringAttr("mode", attrs); framework::DDim dims = alpha_->dims(); bias_ = OpParam::InputYFrom(inputs, scope); axis_ = OpParam::GetAttr("axis", attrs); - keyOutput_ = OpParam::getkey("addOut", inputs, 0); - keyX1_ = OpParam::getkey("addX", inputs, 1); - keyY1_ = OpParam::getkey("Y", inputs, 1); + keyOutput_ = OpParam::Getkey("addOut", inputs, 0); + keyX1_ = OpParam::Getkey("addX", inputs, 1); + keyY1_ = OpParam::Getkey("Y", inputs, 1); if (keyX1_ == keyOutput_) { - bias1_ = OpParam::InputYFrom1(inputs, scope); + bias1_ = OpParam::InputYFrom1(inputs, *scope); } else if (keyY1_ == keyOutput_) { - bias1_ = OpParam::InputXFrom1(inputs, scope); + bias1_ = OpParam::InputXFrom1(inputs, *scope); } this->output_ = OpParam::OutFrom(outputs, scope); } @@ -1863,9 +1933,9 @@ class FusionConvAddBNReluParam : public ConvParam { public: FusionConvAddBNReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); input_bias_ = OpParam::InputBiasFrom(inputs, scope); input_mean_ = OpParam::InputMeanFrom(inputs, scope); @@ -1922,9 +1992,9 @@ class FusionConvBNAddReluParam : public ConvParam { public: FusionConvBNAddReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); input_bias_ = OpParam::InputBiasFrom(inputs, scope); input_mean_ = OpParam::InputMeanFrom(inputs, scope); @@ -1932,13 +2002,13 @@ class FusionConvBNAddReluParam : public ConvParam { input_variance_ = OpParam::InputVarianceFrom(inputs, scope); epsilon_ = OpParam::GetAttr("epsilon", attrs); momentum_ = OpParam::GetAttr("momentum", attrs); - keyBNY_ = OpParam::getkey("BNY", inputs, 0); - keyX_ = OpParam::getkey("X", inputs, 0); - keyY_ = OpParam::getkey("Y", inputs, 0); + keyBNY_ = OpParam::Getkey("BNY", inputs, 0); + keyX_ = OpParam::Getkey("X", inputs, 0); + keyY_ = OpParam::Getkey("Y", inputs, 0); if (keyX_ == keyBNY_) { - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); } else if (keyY_ == keyBNY_) { - bias_ = OpParam::InputXFrom(inputs, scope); + bias_ = OpParam::InputXFrom(inputs, *scope); } this->output_ = OpParam::OutFrom(outputs, scope); } @@ -1992,7 +2062,7 @@ class FusionConvBNParam : public ConvParam { 
public: FusionConvBNParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) + Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { input_bias_ = OpParam::InputBiasFrom(inputs, scope); input_mean_ = OpParam::InputMeanFrom(inputs, scope); @@ -2044,9 +2114,9 @@ class FusionConvAddBNParam : public ConvParam { public: FusionConvAddBNParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); input_bias_ = OpParam::InputBiasFrom(inputs, scope); input_mean_ = OpParam::InputMeanFrom(inputs, scope); @@ -2103,7 +2173,7 @@ class FusionDWConvBNReluParam : public ConvParam { public: FusionDWConvBNReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { input_bias_ = OpParam::InputBiasFrom(inputs, scope); input_mean_ = OpParam::InputMeanFrom(inputs, scope); @@ -2156,7 +2226,7 @@ class FusionConvBNReluParam : public ConvParam { public: FusionConvBNReluParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvParam(inputs, outputs, attrs, scope) { input_bias_ = OpParam::InputBiasFrom(inputs, scope); input_mean_ = OpParam::InputMeanFrom(inputs, scope); @@ -2208,9 +2278,10 @@ class Im2SequenceParam : public OpParam { public: Im2SequenceParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); kernels_ = GetAttr>("kernels", attrs); strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); @@ -2243,9 +2314,10 @@ class DropoutParam : public OpParam { public: DropoutParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); dropout_prob_ = GetAttr("dropout_prob", attrs); } @@ -2271,12 +2343,13 @@ class ConvTransposeParam : public OpParam { public: ConvTransposeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - filter_ = FilterFrom(inputs, scope); - input_ = InputFrom(inputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + filter_ = FilterFrom(inputs, *scope); + input_ = InputFrom(inputs, *scope); // output_ = OutputFrom(outputs, scope); if (outputs.count("Output")) { - output_ = OpParam::OutputFrom(outputs, scope); + output_ = OpParam::OutputFrom(outputs, *scope); } strides_ = GetAttr>("strides", attrs); paddings_ = GetAttr>("paddings", attrs); @@ -2334,11 +2407,11 @@ class FusionDeconvAddParam : public ConvTransposeParam { public: FusionDeconvAddParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap 
&attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : ConvTransposeParam(inputs, outputs, attrs, scope) { - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); - output_ = OpParam::OutFrom(outputs, scope); + output_ = OpParam::OutFrom(outputs, *scope); } GType *Bias() const { return bias_; } @@ -2357,6 +2430,176 @@ class FusionDeconvAddParam : public ConvTransposeParam { template using FusionDeconvAddReluParam = FusionDeconvAddParam; #endif +#ifdef FUSION_DECONVADDBN_OP +template +class FusionDeconvAddBNParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDeconvAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, Scope *scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, *scope); + input_bias_ = OpParam::InputBiasFrom(inputs, *scope); + input_mean_ = OpParam::InputMeanFrom(inputs, *scope); + input_scale_ = OpParam::InputScaleFrom(inputs, *scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif +#ifdef FUSION_DECONVBNRELU_OP +template +class FusionDeconvBNReluParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDeconvBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, Scope *scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, *scope); + input_bias_ = OpParam::InputBiasFrom(inputs, *scope); + input_mean_ = OpParam::InputMeanFrom(inputs, *scope); + input_scale_ = OpParam::InputScaleFrom(inputs, *scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + 
void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif +#ifdef FUSION_DECONVADDBNRELU_OP +template +class FusionDeconvAddBNReluParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDeconvAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, Scope *scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, *scope); + input_bias_ = OpParam::InputBiasFrom(inputs, *scope); + input_mean_ = OpParam::InputMeanFrom(inputs, *scope); + input_scale_ = OpParam::InputScaleFrom(inputs, *scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, *scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif #ifdef FUSION_DECONVRELU_OP template @@ -2377,17 +2620,18 @@ class GruParam : public OpParam { * @param scope * */ GruParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_input_ = InputFrom(inputs, scope); - input_h0_ = InputH0From(inputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_weight_ = InputWeightFrom(inputs, scope); - - output_batch_gate_ = OutputBatchGateFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_input_ = InputFrom(inputs, *scope); + input_h0_ = InputH0From(inputs, *scope); + input_bias_ = InputBiasFrom(inputs, *scope); + input_weight_ = InputWeightFrom(inputs, *scope); + + output_batch_gate_ = OutputBatchGateFrom(outputs, *scope); output_batch_reset_hidden_prev_ = - OutputBatchResetHiddenPrevFrom(outputs, scope); - output_batch_hidden_ = OutputBatchHiddenFrom(outputs, scope); - output_hidden_ = OutputHiddenFrom(outputs, scope); + OutputBatchResetHiddenPrevFrom(outputs, *scope); + output_batch_hidden_ = OutputBatchHiddenFrom(outputs, *scope); + output_hidden_ = OutputHiddenFrom(outputs, *scope); activation_ = GetStringAttr("activation", attrs); gate_activation_ = GetStringAttr("gate_activation", attrs); 
is_reverse_ = GetAttr("is_reverse", attrs); @@ -2430,16 +2674,17 @@ class GruUnitParam : public OpParam { public: GruUnitParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_input_ = InputFrom(inputs, scope); - input_hidden_prev_ = InputHiddenPrevFrom(inputs, scope); - input_bias_ = InputBiasFrom(inputs, scope); - input_weight_ = InputWeightFrom(inputs, scope); - - output_gate_ = OutputGateFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_input_ = InputFrom(inputs, *scope); + input_hidden_prev_ = InputHiddenPrevFrom(inputs, *scope); + input_bias_ = InputBiasFrom(inputs, *scope); + input_weight_ = InputWeightFrom(inputs, *scope); + + output_gate_ = OutputGateFrom(outputs, *scope); output_reset_hidden_prev_ = - OutputResetHiddenPrevFrom(outputs, scope); - output_hidden_ = OutputHiddenFrom(outputs, scope); + OutputResetHiddenPrevFrom(outputs, *scope); + output_hidden_ = OutputHiddenFrom(outputs, *scope); activation_ = GetAttr("activation", attrs); gate_activation_ = GetAttr("gate_activation", attrs); } @@ -2476,9 +2721,10 @@ class FlattenParam : public OpParam { public: FlattenParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); axis = GetAttr("axis", attrs); } const GType *InputX() const { return input_x_; } @@ -2500,9 +2746,10 @@ class SplitParam : public OpParam { public: SplitParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - outs_ = OutMultiFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + outs_ = OutMultiFrom(outputs, *scope); axis = GetAttr("axis", attrs); num = GetAttr("num", attrs); sections = GetAttr>("sections", attrs); @@ -2546,10 +2793,11 @@ class BilinearInterpParam : public OpParam { public: BilinearInterpParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_outsize_ = InputOutSizeFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_outsize_ = InputOutSizeFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); out_h_ = GetAttr("out_h", attrs); out_w_ = GetAttr("out_w", attrs); } @@ -2576,9 +2824,10 @@ class ShapeParam : public OpParam { public: ShapeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputFrom(inputs, scope); - out_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputFrom(inputs, *scope); + out_ = OutFrom(outputs, *scope); } const GType *Input() const { return input_; } GType *Out() const { return out_; } @@ -2597,10 +2846,11 @@ class TopKParam : public OpParam { public: TopKParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = OpParam::GetVarValue("X", 
inputs, scope); - output_ = OpParam::GetVarValue("Out", outputs, scope); - indices_ = OpParam::GetVarValue("Indices", outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = OpParam::GetVarValue("X", inputs, *scope); + output_ = OpParam::GetVarValue("Out", outputs, *scope); + indices_ = OpParam::GetVarValue("Indices", outputs, *scope); k_ = OpParam::GetAttr("k", attrs); } @@ -2620,9 +2870,10 @@ class CastParam : public OpParam { public: CastParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = OpParam::GetVarValue("X", inputs, scope); - output_ = OpParam::GetVarValue("Out", outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = OpParam::GetVarValue("X", inputs, *scope); + output_ = OpParam::GetVarValue("Out", outputs, *scope); input_type_ = OpParam::GetAttr("in_dtype", attrs); output_type_ = OpParam::GetAttr("out_dtype", attrs); } @@ -2643,16 +2894,17 @@ class QuantizeParam : public OpParam { public: QuantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputXFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); // online // scale = max(abs(x)) - online_scale_ = OpParam::GetVarValue("OutScale", outputs, scope); + online_scale_ = OpParam::GetVarValue("OutScale", outputs, *scope); // offline if (inputs.count("InScale")) { offline_ = true; - offline_scale_ = OpParam::GetVarValue("InScale", inputs, scope); + offline_scale_ = OpParam::GetVarValue("InScale", inputs, *scope); } // x = round(scale * x) if (OpParam::HasAttr("round_type", attrs)) { @@ -2684,10 +2936,11 @@ class DequantizeParam : public OpParam { public: DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); - activation_scale_ = OpParam::GetVarValue("Scale", inputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputXFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); + activation_scale_ = OpParam::GetVarValue("Scale", inputs, *scope); // dequantization is performed as x = x / static_scale / online_scale if (OpParam::HasAttr("weight_scale", attrs)) { weight_scale_ = OpParam::GetAttr("weight_scale", attrs); @@ -2719,13 +2972,13 @@ class FusionDequantBNParam : public DequantizeParam { public: FusionDequantBNParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : DequantizeParam(inputs, outputs, attrs, scope) { // batch norm params - bn_mean_ = OpParam::GetVarValue("BNMean", inputs, scope); - bn_variance_ = OpParam::GetVarValue("BNVariance", inputs, scope); - bn_scale_ = OpParam::GetVarValue("BNScale", inputs, scope); - bn_bias_ = OpParam::GetVarValue("BNBias", inputs, scope); + bn_mean_ = OpParam::GetVarValue("BNMean", inputs, *scope); + bn_variance_ = OpParam::GetVarValue("BNVariance", inputs, *scope); + bn_scale_ = OpParam::GetVarValue("BNScale", inputs, *scope); + bn_bias_ = OpParam::GetVarValue("BNBias", inputs, *scope); epsilon_ = OpParam::GetAttr("epsilon", attrs); 
} @@ -2751,11 +3004,11 @@ class FusionDequantAddBNParam : public FusionDequantBNParam { public: FusionDequantAddBNParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : FusionDequantBNParam(inputs, outputs, attrs, scope) { // element wise add params axis_ = OpParam::GetAttr("axis", attrs); - bias_ = OpParam::InputYFrom(inputs, scope); + bias_ = OpParam::InputYFrom(inputs, *scope); } public: @@ -2774,14 +3027,14 @@ class FusionDequantAddBNQuantParam : public FusionDequantAddBNParam { public: FusionDequantAddBNQuantParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + const AttributeMap &attrs, Scope *scope) : FusionDequantAddBNParam(inputs, outputs, attrs, scope) { // scale output - online_scale_ = OpParam::GetVarValue("OutScale", outputs, scope); + online_scale_ = OpParam::GetVarValue("OutScale", outputs, *scope); // offline if (inputs.count("InScale")) { offline_ = true; - offline_scale_ = OpParam::GetVarValue("InScale", inputs, scope); + offline_scale_ = OpParam::GetVarValue("InScale", inputs, *scope); } // x = round(scale * x) if (OpParam::HasAttr("round_type", attrs)) { @@ -2810,10 +3063,11 @@ class SequenceExpandParam : public OpParam { public: SequenceExpandParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); ref_level_ = -1; if (OpParam::HasAttr("ref_level", attrs)) { ref_level_ = OpParam::GetAttr("ref_level", attrs); @@ -2837,9 +3091,10 @@ class SequencePoolParam : public OpParam { public: SequencePoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = InputXFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); pool_type_ = "MAX"; if (OpParam::HasAttr("pooltype", attrs)) { pool_type_ = OpParam::GetStringAttr("pooltype", attrs); @@ -2861,12 +3116,13 @@ class LodResetParam : public OpParam { public: LodResetParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); input_y_ = nullptr; if (inputs.count("Y")) { - input_y_ = InputYFrom(inputs, scope); + input_y_ = InputYFrom(inputs, *scope); } else { target_lod_ = OpParam::GetAttr>("target_lod", attrs); } @@ -2888,10 +3144,11 @@ class CompareParam : public OpParam { public: CompareParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + output_ = 
OutFrom(outputs, *scope); axis_ = OpParam::GetAttr("axis", attrs); } @@ -2912,10 +3169,11 @@ class LogicalBinaryParam : public OpParam { public: LogicalBinaryParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - input_y_ = InputYFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + input_y_ = InputYFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } @@ -2938,9 +3196,10 @@ class LogicalUnaryParam : public OpParam { public: LogicalUnaryParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } @@ -2958,11 +3217,12 @@ class WriteToArrayParam : public OpParam { public: WriteToArrayParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { - input_ = OpParam::GetVarValue("X", inputs, scope); - index_ = OpParam::GetVarValue("I", inputs, scope); + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_ = OpParam::GetVarValue("X", inputs, *scope); + index_ = OpParam::GetVarValue("I", inputs, *scope); output_ = - OpParam::GetVarValue("Out", outputs, scope); + OpParam::GetVarValue("Out", outputs, *scope); } public: @@ -2978,11 +3238,13 @@ class ReadFromArrayParam : public OpParam { public: ReadFromArrayParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, - const Scope &scope) { + Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { input_ = - OpParam::GetVarValue("X", inputs, scope); - index_ = OpParam::GetVarValue("I", inputs, scope); - output_ = OpParam::GetVarValue("Out", outputs, scope); + OpParam::GetVarValue("X", inputs, *scope); + index_ = OpParam::GetVarValue("I", inputs, *scope); + output_ = + OpParam::GetVarValue("Out", outputs, *scope); } public: @@ -3000,9 +3262,10 @@ class IsEmptyParam : public OpParam { public: IsEmptyParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { - input_x_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + const AttributeMap &attrs, Scope *scope) + : OpParam(inputs, outputs, attrs, scope) { + input_x_ = InputXFrom(inputs, *scope); + output_ = OutFrom(outputs, *scope); } const GType *InputX() const { return input_x_; } @@ -3022,7 +3285,8 @@ class IncrementParam : public OpParam { public: IncrementParam(const VariableNameMap &inputs, const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) { + const AttributeMap &attrs, const Scope &scope) + : OpParam(inputs, outputs, attrs, scope) { input_x_ = InputXFrom(inputs, scope); output_ = OutFrom(outputs, scope); step_ = OpParam::GetAttr("step", attrs); @@ -3038,6 +3302,27 @@ class IncrementParam : public OpParam { float step_; }; #endif // INCREMENT_OP +#ifdef PAD2D_OP +template +class Pad2dParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Pad2dParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + 
             const AttributeMap &attrs, Scope *scope)
+      : OpParam(inputs, outputs, attrs, scope) {
+    input_x_ = InputXFrom(inputs, *scope);
+    out_ = OutFrom(outputs, *scope);
+  }
+  const RType *InputX() const { return input_x_; }
+  RType *Out() const { return out_; }
+
+ private:
+  RType *input_x_;
+  RType *out_;
+};
+#endif
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/pad2d_op.cpp b/src/operators/pad2d_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7eda00d0830f719f8d7aa76ab77544b585d9b45
--- /dev/null
+++ b/src/operators/pad2d_op.cpp
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PAD2D_OP
+
+#include "operators/pad2d_op.h"
+namespace paddle_mobile {
+namespace operators {
+
+template
+void Pad2dOp::InferShape() const {
+  auto input_dims = this->param_.InputX()->dims();
+  auto input_n = input_dims[0];
+  auto input_c = input_dims[1];
+  auto input_h = input_dims[2];
+  auto input_w = input_dims[3];
+
+  this->param_.Out()->Resize({input_n, input_c, input_h + 1, input_w + 1});
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(pad2d, ops::Pad2dOp);
+#endif
+#ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2dOp);
+#endif
+
+#endif
diff --git a/src/operators/pad2d_op.h b/src/operators/pad2d_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..761e2b837d34b8d51629b883a8cd6797037e5d9b
--- /dev/null
+++ b/src/operators/pad2d_op.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PAD2D_OP
+
+#pragma once
+
+#include
+
+#include "framework/operator.h"
+#include "operators/kernel/pad2d_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+using framework::AttributeMap;
+using framework::OperatorWithKernel;
+using framework::Scope;
+using std::string;
+template
+class Pad2dOp
+    : public OperatorWithKernel,
+                                operators::Pad2dKernel> {
+ public:
+  Pad2dOp(const string &type, const VariableNameMap &inputs,
+          const VariableNameMap &outputs, const AttributeMap &attrs,
+          std::shared_ptr scope)
+      : OperatorWithKernel,
+                           operators::Pad2dKernel>(
+            type, inputs, outputs, attrs, scope) {}
+  void InferShape() const override;
+
+ private:
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index fdd7c46fedc98b3f1811cd10ffe6bcec7d0e3a46..167b374de91ee4224578e4fc30c18fb8e2e5ea9d 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -77,6 +77,15 @@ if (CON GREATER -1)
     ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
     target_link_libraries(test-rfcn paddle-mobile)
 
+    ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-marker paddle-mobile)
+
+    ADD_EXECUTABLE(test-rfcn-api fpga/test_rfcn_api.cpp)
+    target_link_libraries(test-rfcn-api paddle-mobile)
+
+    ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
+    target_link_libraries(test-marker2 paddle-mobile)
+
     set(FOUND_MATCH ON)
 endif ()
diff --git a/test/fpga/test_marker.cpp b/test/fpga/test_marker.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a266773e4e9924ba52d3ced522d8e2821e003f5
--- /dev/null
+++ b/test/fpga/test_marker.cpp
@@ -0,0 +1,167 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include "../test_helper.h"
+#include "../test_include.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif
+#include
+
+void readStream(std::string filename, char *buf) {
+  std::ifstream in;
+  in.open(filename, std::ios::in | std::ios::binary);
+  if (!in.is_open()) {
+    std::cout << "open File Failed."
<< std::endl; + return; + } + + in.seekg(0, std::ios::end); // go to the end + auto length = in.tellg(); // report location (this is the length) + in.seekg(0, std::ios::beg); // go back to the beginning + in.read(buf, length); + DLOG << length; + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int num, int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * amount_per_side * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } + } +} + +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + // bool use_chw = true; + if (input_tensor.dims().size() != 4) return; + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + int n = (input_tensor.dims())[0]; + auto data_ptr = input_tensor.get_data(); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + auto data_tmp = data_ptr_16; + if (use_chw) { + data_tmp = + reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); + convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); + } + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + if (data_tmp != data_ptr_16) { + free(data_tmp); + } +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} + +void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, + bool use_chw) { + static int i = 0; + if (input_tensor.numel() == 0) { + return; + } + if (input_tensor.type() == typeid(float)) { + DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); + dump_stride_float(filename, input_tensor, dumpnum); + } else { + DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); + dump_stride_half(filename, input_tensor, dumpnum, use_chw); + } + DLOG << "dump input address: " << input_tensor.get_data(); +} + +static const char *g_marker_combine = "../models/marker/model"; +static const char *g_image_src_float = "../models/marker/model/input_0.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + + // if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + // std::string(g_rfcn_combine) + "/params", true, false, + // 1, true)) { + if (paddle_mobile.Load(std::string(g_marker_combine), true)) { + float img_info[3] = {720, 1280, 800.0f / 960.0f}; + auto img = reinterpret_cast( + fpga::fpga_malloc(720 * 1280 * 3 * sizeof(float))); + readStream(g_image_src_float, reinterpret_cast(img)); + + std::vector v(3, nullptr); + paddle_mobile.FeedData({img}); + paddle_mobile.Predict_To(-1); + + for (int i = 47; i < 52; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "marker_" + std::to_string(i); + // if(i != 58) + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(float)); + // tensor_ptr->numel() * sizeof(float)); + + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + + /* float result = 0; + std::string str = "softmax_input_data"; + float* data = + static_cast(fpga::fpga_malloc(tensor_ptr->numel() * + sizeof(float))); str = "softmax_output_data"; auto output_ptr = + static_cast((*tensor_ptr).get_data()); for (int idx = 0; idx < + tensor_ptr->numel(); ++idx) + { + data[idx] = fpga::fp16_2_fp32(output_ptr[idx]); + } + fpga::savefile(str,data, tensor_ptr->numel(), result ); */ + } + + // paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/test/fpga/test_marker2.cpp b/test/fpga/test_marker2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b4af515c7313325bffa7ba0ec465b985d6eb75b8 --- /dev/null +++ b/test/fpga/test_marker2.cpp @@ -0,0 +1,181 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif +#include +#ifdef COST_TIME_PRINT +#include +#include +#include +#endif +void readStream(std::string filename, char *buf) { + std::ifstream in; + in.open(filename, std::ios::in | std::ios::binary); + if (!in.is_open()) { + std::cout << "open File Failed." << std::endl; + return; + } + + in.seekg(0, std::ios::end); // go to the end + auto length = in.tellg(); // report location (this is the length) + in.seekg(0, std::ios::beg); // go back to the beginning + in.read(buf, length); + DLOG << length; + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int num, int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * amount_per_side * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } + } +} + +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + // bool use_chw = true; + if (input_tensor.dims().size() != 4) return; + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + int n = (input_tensor.dims())[0]; + auto data_ptr = input_tensor.get_data(); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + auto data_tmp = data_ptr_16; + if (use_chw) { + data_tmp = + reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); + convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); + } + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + if (data_tmp != data_ptr_16) { + free(data_tmp); + } +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} + +void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, + bool use_chw) { + static int i = 0; + if (input_tensor.numel() == 0) { + return; + } + if (input_tensor.type() == typeid(float)) { + DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); + dump_stride_float(filename, input_tensor, dumpnum); + } else { + DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); + dump_stride_half(filename, input_tensor, dumpnum, use_chw); + } + DLOG << "dump input address: " << input_tensor.get_data(); +} + +static const char *g_marker_combine = "../models/marker/marker_2segment"; +// static const char *g_marker_combine = "../models/marker/model2"; +static const char *g_image_src_float = + "../models/marker/marker_2segment/marker_2.bin"; +// static const char *g_image_src_float = "../models/marker/model2/data.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + + if (paddle_mobile.Load(std::string(g_marker_combine) + "/model", + std::string(g_marker_combine) + "/params", true, false, + 1, true)) { + // if (paddle_mobile.Load(std::string(g_marker_combine), true)) { + float img_info[3] = {432, 1280, 1.0f}; + auto img = reinterpret_cast( + fpga::fpga_malloc(144 * 14 * 14 * sizeof(float))); + readStream(g_image_src_float, reinterpret_cast(img)); + + std::vector v(3, nullptr); + paddle_mobile.FeedData({img}); + // paddle_mobile.Predict_To(-1); +#ifdef COST_TIME_PRINT + timeval start11, end11; + long dif_sec, dif_usec; // NOLINT +#endif + +#ifdef COST_TIME_PRINT + gettimeofday(&start11, NULL); +#endif + + paddle_mobile.Predict_To(-1); + +#ifdef COST_TIME_PRINT + gettimeofday(&end11, NULL); + dif_sec = end11.tv_sec - start11.tv_sec; + dif_usec = end11.tv_usec - start11.tv_usec; + std::cout << "total: " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us" + << std::endl; +#endif + + for (int i = 0; i < 8; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "marker_" + std::to_string(i); + // if(i != 58) + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(float)); + // tensor_ptr->numel() * sizeof(float)); + + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + } + + // paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 723e4ea3e3ff35e0d555703391adcafacccb42f1..e48ad33f36cdee1e57ffba9bf64c6546691f0566 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -1,140 +1,140 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, float *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - string strOne; - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump(std::string filename, Tensor input_tensor) { - auto dataptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - for (int i = 0; i < input_tensor.numel(); ++i) { - result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); - out << result << std::endl; - } - out.close(); -} -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum) { - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.get_data(); - auto *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - convert_to_chw(&data_ptr_16, c, h, w, data_tmp); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - free(data_tmp); -} - -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? 
stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} -static const char *g_resnet50 = "../models/resnet50"; -const std::string g_image_src_float = "../images/image_src_float"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - if (paddle_mobile.Load(std::string(g_resnet50), true)) { - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), - static_cast(2)); - readStream(g_image_src_float, - input_tensor.mutable_data({1, 3, 224, 224})); - paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); - for (int i = 0; i < 73; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(half)); - dump_stride_half(saveName, (*tensor_ptr), 20); - // dump(saveName, (*tensor_ptr)); - } - - auto tensor_ptr = paddle_mobile.FetchResult(73); - dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); - tensor_ptr = paddle_mobile.FetchResult(74); - dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); - - float max = 0; - auto data_ptr = tensor_ptr->data(); - int maximumIdx = 0; - for (int i = 0; i < (*tensor_ptr).numel(); i++) { - if (data_ptr[i] > max) { - maximumIdx = i; - max = data_ptr[i]; - } - } - std::cout << "index : " << std::dec << maximumIdx << ", value : " << max - << std::endl; - std::cout << "Computation done" << std::endl; - return 0; - } -} +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +void readStream(std::string filename, float *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." 
<< std::endl; + return; + } + string strOne; + int i = 0; + while (!in.eof()) { + in >> buf[i]; + i++; + } + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); + } + } + } +} + +void dump(std::string filename, Tensor input_tensor) { + auto dataptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + for (int i = 0; i < input_tensor.numel(); ++i) { + result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); + out << result << std::endl; + } + out.close(); +} +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum) { + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + auto data_ptr = input_tensor.get_data(); + auto *data_tmp = + reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + convert_to_chw(&data_ptr_16, c, h, w, data_tmp); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + free(data_tmp); +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} +static const char *g_resnet50 = "../models/resnet50"; +const std::string g_image_src_float = "../images/image_src_float"; // NOLINT +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + if (paddle_mobile.Load(std::string(g_resnet50), true)) { + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), + static_cast(2)); + readStream(g_image_src_float, + input_tensor.mutable_data({1, 3, 224, 224})); + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(-1); + for (int i = 0; i < 73; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "resnet50_result_" + std::to_string(i); + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(half)); + // dump_stride_half(saveName, (*tensor_ptr), 20); + // dump(saveName, (*tensor_ptr)); + } + + auto tensor_ptr = paddle_mobile.FetchResult(73); + // dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); + tensor_ptr = paddle_mobile.FetchResult(74); + // dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); + + float max = 0; + auto data_ptr = tensor_ptr->data(); + int maximumIdx = 0; + for (int i = 0; i < (*tensor_ptr).numel(); i++) { + if (data_ptr[i] > max) { + maximumIdx = i; + max = data_ptr[i]; + } + } + std::cout << "index : " << std::dec << maximumIdx << ", value : " << max + << std::endl; + std::cout << "Computation done" << std::endl; + return 0; + } +} diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp index e1d13541ef8000da18ceda4c356d158198d7b9f4..50f8aa863d45c3c118f60367bf7b9921e0667891 100644 --- a/test/fpga/test_rfcn.cpp +++ b/test/fpga/test_rfcn.cpp @@ -1,62 +1,152 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, uint8_t *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." 
diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp
index e1d13541ef8000da18ceda4c356d158198d7b9f4..50f8aa863d45c3c118f60367bf7b9921e0667891 100644
--- a/test/fpga/test_rfcn.cpp
+++ b/test/fpga/test_rfcn.cpp
@@ -1,62 +1,152 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include "../test_helper.h"
-#include "../test_include.h"
-
-#ifdef PADDLE_MOBILE_FPGA_V1
-#include "fpga/V1/api.h"
-#endif
-#ifdef PADDLE_MOBILE_FPGA_V2
-#include "fpga/V2/api.h"
-#endif
-
-void readStream(std::string filename, uint8_t *buf) {
-  std::ifstream in;
-  in.open(filename, std::ios::in);
-  if (!in.is_open()) {
-    std::cout << "open File Failed." << std::endl;
-    return;
-  }
-  int i = 0;
-  while (!in.eof()) {
-    in >> buf[i];
-    i++;
-  }
-  in.close();
-}
-
-static const char *g_rfcn_combine = "../models/rfcn";
-static const char *g_image_src_float = "../models/rfcn/data.bin";
-int main() {
-  paddle_mobile::fpga::open_device();
-  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
-
-  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
-                         std::string(g_rfcn_combine) + "/params", true, false,
-                         1, true)) {
-    float img_info[3] = {768, 1536, 768.0f / 960.0f};
-    auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float));
-    readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img));
-    std::vector<void *> v(3, nullptr);
-    paddle_mobile.FeedData({img_info, img});
-    paddle_mobile.Predict_To(-1);
-    paddle_mobile.GetResults(&v);
-    DLOG << "Computation done";
-    fpga::fpga_free(img);
-  }
-
-  return 0;
-}
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include "../test_helper.h"
+#include "../test_include.h"
+
+#ifdef PADDLE_MOBILE_FPGA_V1
+#include "fpga/V1/api.h"
+#endif
+#ifdef PADDLE_MOBILE_FPGA_V2
+#include "fpga/V2/api.h"
+#endif
+
+#include <string>
+
+void readStream(std::string filename, char *buf) {
+  std::ifstream in;
+  in.open(filename, std::ios::in | std::ios::binary);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    return;
+  }
+
+  in.seekg(0, std::ios::end);  // go to the end
+  auto length = in.tellg();    // report location (this is the length)
+  in.seekg(0, std::ios::beg);  // go back to the beginning
+  in.read(buf, length);
+  DLOG << length;
+  in.close();
+}
+
+void convert_to_chw(int16_t **data_in, int channel, int height, int width,
+                    int num, int16_t *data_tmp) {
+  int64_t amount_per_side = width * height;
+  for (int n = 0; n < num; n++) {
+    for (int h = 0; h < height; h++) {
+      for (int w = 0; w < width; w++) {
+        for (int c = 0; c < channel; c++) {
+          *(data_tmp + n * amount_per_side * channel + c * amount_per_side +
+            width * h + w) = *((*data_in)++);
+        }
+      }
+    }
+  }
+}
+
+void dump_stride_half(std::string filename, Tensor input_tensor,
+                      const int dumpnum, bool use_chw) {
+  // bool use_chw = true;
+  if (input_tensor.dims().size() != 4) return;
+  int c = (input_tensor.dims())[1];
+  int h = (input_tensor.dims())[2];
+  int w = (input_tensor.dims())[3];
+  int n = (input_tensor.dims())[0];
+  auto data_ptr = input_tensor.get_data();
+  auto *data_ptr_16 = reinterpret_cast<int16_t *>(data_ptr);
+  auto data_tmp = data_ptr_16;
+  if (use_chw) {
+    data_tmp =
+        reinterpret_cast<int16_t *>(malloc(n * c * h * w * sizeof(int16_t)));
+    convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
+  }
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int stride = input_tensor.numel() / dumpnum;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
+    out << result << std::endl;
+  }
+  out.close();
+  if (data_tmp != data_ptr_16) {
+    free(data_tmp);
+  }
+}
+
+void dump_stride_float(std::string filename, Tensor input_tensor,
+                       const int dumpnum) {
+  auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
+  std::ofstream out(filename.c_str());
+  float result = 0;
+  int stride = input_tensor.numel() / dumpnum;
+  stride = stride > 0 ? stride : 1;
+  for (int i = 0; i < input_tensor.numel(); i += stride) {
+    result = data_ptr[i];
+    out << result << std::endl;
+  }
+  out.close();
+}
+
+void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
+                 bool use_chw) {
+  static int i = 0;
+  if (input_tensor.numel() == 0) {
+    return;
+  }
+  if (input_tensor.type() == typeid(float)) {
+    DLOG << "op: " << i++ << ", float data " << input_tensor.numel();
+
+    dump_stride_float(filename, input_tensor, dumpnum);
+  } else {
+    DLOG << "op: " << i++ << ", half data " << input_tensor.numel();
+
+    dump_stride_half(filename, input_tensor, dumpnum, use_chw);
+  }
+  DLOG << "dump input address: " << input_tensor.get_data();
+}
+
+static const char *g_rfcn_combine = "../models/rfcn";
+static const char *g_image_src_float = "../models/rfcn/data.bin";
+int main() {
+  paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+
+  if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
+                         std::string(g_rfcn_combine) + "/params", true, false,
+                         1, true)) {
+    float img_info[3] = {768, 1536, 768.0f / 960.0f};
+    auto img = reinterpret_cast<float *>(
+        fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)));
+    readStream(g_image_src_float, reinterpret_cast<char *>(img));
+
+    std::vector<void *> v(3, nullptr);
+    paddle_mobile.FeedData(std::vector<void *>({img_info, img}));
+    paddle_mobile.Predict_To(-1);
+
+    for (int i = 65; i < 69; i++) {
+      auto tensor_ptr = paddle_mobile.FetchResult(i);
+      std::string saveName = "rfcn_" + std::to_string(i);
+      paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
+                                           tensor_ptr->numel() * sizeof(float));
+      dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
+    }
+    // paddle_mobile.GetResults(&v);
+    DLOG << "Computation done";
+    fpga::fpga_free(img);
+  }
+
+  return 0;
+}
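Note: the dump_stride_* helpers above share one sampling rule: write roughly dumpnum values by stepping through the tensor with stride = numel / dumpnum, clamped to at least 1. A minimal sketch of that rule, with an illustrative helper name (not part of the patch):

#include <fstream>
#include <string>
#include <vector>

// Write at most ~dumpnum evenly strided samples of buf to filename.
void dump_sampled(const std::string &filename, const std::vector<float> &buf,
                  int dumpnum) {
  int stride = static_cast<int>(buf.size()) / dumpnum;
  stride = stride > 0 ? stride : 1;  // small buffers are dumped in full
  std::ofstream out(filename.c_str());
  for (size_t i = 0; i < buf.size(); i += stride) {
    out << buf[i] << std::endl;
  }
}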
diff --git a/test/fpga/test_rfcn_api.cpp b/test/fpga/test_rfcn_api.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d9e488962597d5df20d2fc25877dde5052a1fb9f
--- /dev/null
+++ b/test/fpga/test_rfcn_api.cpp
@@ -0,0 +1,146 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef PADDLE_MOBILE_FPGA
+#define PADDLE_MOBILE_FPGA
+#endif
+#include <fstream>
+#include <iostream>
+#include "io/paddle_inference_api.h"
+
+static const char *g_image = "../models/rfcn/data.bin";
+static const char *g_model = "../models/rfcn/model";
+static const char *g_param = "../models/rfcn/params";
+
+void readStream(std::string filename, char *buf) {
+  std::ifstream in;
+  in.open(filename, std::ios::in | std::ios::binary);
+  if (!in.is_open()) {
+    std::cout << "open File Failed." << std::endl;
+    return;
+  }
+
+  in.seekg(0, std::ios::end);  // go to the end
+  auto length = in.tellg();    // report location (this is the length)
+  in.seekg(0, std::ios::beg);  // go back to the beginning
+  in.read(buf, length);
+  in.close();
+}
+
+PaddleMobileConfig GetConfig() {
+  PaddleMobileConfig config;
+  config.precision = PaddleMobileConfig::FP32;
+  config.device = PaddleMobileConfig::kFPGA;
+  config.prog_file = g_model;
+  config.param_file = g_param;
+  config.thread_num = 1;
+  config.batch_size = 1;
+  config.optimize = true;
+  config.lod_mode = true;
+  config.quantification = false;
+  return config;
+}
+
+int main() {
+  open_device();
+  PaddleMobileConfig config = GetConfig();
+  auto predictor =
+      CreatePaddlePredictor<PaddleMobileConfig,
+                            PaddleEngineKind::kPaddleMobile>(config);
+
+  std::cout << "Finishing loading model" << std::endl;
+
+  float img_info[3] = {768, 1536, 768.0f / 960.0f};
+  int img_length = 768 * 1536 * 3;
+  auto img = reinterpret_cast<float *>(fpga_malloc(img_length * sizeof(float)));
+  readStream(g_image, reinterpret_cast<char *>(img));
+
+  std::cout << "Finishing initializing data" << std::endl;
+  /*
+  predictor->FeedData({img_info, img});
+  predictor->Predict_From_To(0, -1);
+  std::cout << " Finishing predicting " << std::endl;
+  std::vector<void *> v(3, nullptr);
+  predictor->GetResults(&v);
+  int post_nms = 300;
+  for (int num = 0; num < post_nms; num ++){
+    for (int i = 0; i < 8; i ++){
+      std:: cout << ((float*)(v[0]))[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num ++){
+    for (int i = 0; i < 8; i ++){
+      std:: cout << ((float*)(v[1]))[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num ++){
+    for (int i = 0; i < 4; i ++){
+      std:: cout << ((float*)(v[2]))[num * 4 + i] << std::endl;
+    }
+  }
+  */
+
+  struct PaddleTensor t_img_info, t_img;
+  t_img_info.dtype = FLOAT32;
+  t_img_info.layout = LAYOUT_HWC;
+  t_img_info.shape = std::vector<int>({1, 3});
+  t_img_info.name = "Image information";
+  t_img_info.data.Reset(img_info, 3 * sizeof(float));
+
+  t_img.dtype = FLOAT32;
+  t_img.layout = LAYOUT_HWC;
+  t_img.shape = std::vector<int>({1, 768, 1536, 3});
+  t_img.name = "Image information";
+  t_img.data.Reset(img, img_length * sizeof(float));
+  predictor->FeedPaddleTensors({t_img_info, t_img});
+
+  std::cout << "Finishing feeding data " << std::endl;
+
+  predictor->Predict_From_To(0, -1);
+  std::cout << "Finishing predicting " << std::endl;
+
+  std::vector<PaddleTensor> v;        // No need to initialize v
+  predictor->FetchPaddleTensors(&v);  // Old data in v will be cleared
+  std::cout << "Output number is " << v.size() << std::endl;
+
+  auto post_nms = v[0].data.length() / sizeof(float) / 8;
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      auto p = reinterpret_cast<float *>(v[0].data.data());
+      std::cout << p[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 8; i++) {
+      auto p = reinterpret_cast<float *>(v[1].data.data());
+      std::cout << p[num * 8 + i] << std::endl;
+    }
+  }
+  for (int num = 0; num < post_nms; num++) {
+    for (int i = 0; i < 4; i++) {
+      auto p = reinterpret_cast<float *>(v[2].data.data());
+      std::cout << p[num * 4 + i] << std::endl;
+    }
+  }
+  std::cout << "Finish getting vector values" << std::endl;
+
+  PaddleTensor tensor;
+  predictor->GetPaddleTensor("fetch2", &tensor);
+  for (int i = 0; i < post_nms; i++) {
+    auto p = reinterpret_cast<float *>(tensor.data.data());
+    std::cout << p[+i] << std::endl;
+  }
+
+  return 0;
+}
diff --git a/test/net/test_mobilenet_GPU.cpp b/test/net/test_mobilenet_GPU.cpp
index 22ff12f592834efeab1566c2bd4fbd3b5c00dc26..1f3bc921cf812540c0b056f485dfff3ed41e7c78 100644
--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -25,11 +25,11 @@ int main() {
   paddle_mobile.SetCLPath("/data/local/tmp/bin");
 #endif
 
-  // auto isok =
-  //     paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model",
-  //                        std::string(g_mobilenet_mul) + "/params", true);
+  auto isok = paddle_mobile.Load(
+      std::string(g_mobilenet_vision) + "/vision_mobilenet_model",
+      std::string(g_mobilenet_vision) + "/vision_mobilenet_params", true);
 
-  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
+  // auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
   if (isok) {
     auto time2 = paddle_mobile::time();
     std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
@@ -37,12 +37,13 @@ int main() {
 
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
-    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+    GetInput<float>(g_test_image_1x3x224x224_vision_mobilenet_input, &input,
+                    dims);
 
     std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
 
     auto time3 = paddle_mobile::time();
-    int max = 10;
+    int max = 1;
     for (int i = 0; i < max; ++i) {
       vec_result = paddle_mobile.Predict(input, dims);
     }
diff --git a/test/net/test_mobilenet_combine.cpp b/test/net/test_mobilenet_combine.cpp
index 073607795967af09c81bc0a0c492d065bce7ed72..af00085b6d919553cfb4669a1c7da807ec24f87d 100644
--- a/test/net/test_mobilenet_combine.cpp
+++ b/test/net/test_mobilenet_combine.cpp
@@ -20,14 +20,18 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   paddle_mobile.SetThreadNum(4);
   auto time1 = time();
-  if (paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model",
-                         std::string(g_mobilenet_combined) + "/params", true)) {
+
+  if (paddle_mobile.Load(
+          std::string(g_mobilenet_vision) + "/vision_mobilenet_model",
+          std::string(g_mobilenet_vision) + "/vision_mobilenet_params", true)) {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
 
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
-    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+
+    GetInput<float>(g_test_image_1x3x224x224_vision_mobilenet_input, &input,
+                    dims);
 
     auto vec_result = paddle_mobile.Predict(input, dims);
     std::vector<float>::iterator biggest =
@@ -39,8 +43,9 @@
     for (int i = 0; i < 10; ++i) {
       auto vec_result = paddle_mobile.Predict(input, dims);
     }
+
     auto time3 = time();
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < 1; ++i) {
       auto vec_result = paddle_mobile.Predict(input, dims);
     }
     auto time4 = time();
diff --git a/test/net/test_yolo_combined.cpp b/test/net/test_yolo_combined.cpp
index 88b889daa946cfaef1d86ff36f416b4643532c89..5a589878cccf2fddaa3a29d73db7737d71cff722 100644
--- a/test/net/test_yolo_combined.cpp
+++ b/test/net/test_yolo_combined.cpp
@@ -23,15 +23,15 @@ int main() {
 
   // ../../../test/models/mobilenet
   auto time1 = time();
-  if (paddle_mobile.Load(std::string(g_yolo_combined) + "/model",
-                         std::string(g_yolo_combined) + "/params", true)) {
+  if (paddle_mobile.Load(std::string(g_yolo_vision) + "/model",
+                         std::string(g_yolo_vision) + "/params", true)) {
     auto time2 = time();
     std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
 
     std::vector<int64_t> dims{1, 3, 416, 416};
     std::vector<float> input;
 
-    GetInput<float>(g_test_image_desktop_1_3_416_416_nchw_float, &input, dims);
+    GetInput<float>(g_test_image_1x3x416x416_vision_yolo_input, &input, dims);
     std::cout << "input.size(): " << input.size() << std::endl;
     for (int j = 0; j < 100; ++j) {
       std::cout << j << " : " << input[j] << std::endl;
@@ -42,13 +42,6 @@ int main() {
     // }
     auto time3 = time();
     const vector<float> vector_out = paddle_mobile.Predict(input, dims);
-    std::cout << "--------------------------------------------" << std::endl;
-
-    for (float i : vector_out) {
-      std::cout << i << std::endl;
-    }
-
-    std::cout << "--------------------------------------------" << std::endl;
 
     std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
 
diff --git a/test/net/test_yologpu.cpp b/test/net/test_yologpu.cpp
index 0215ded59e5f74f0c103d4b51abe06b487bd50ab..37f4a7801920eed1722d390002345f3b9ae86036 100644
--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <iostream>
-#include <thread>
+#include <thread>  // NOLINT
 #include "../../src/common/types.h"
 #include "../../src/io/paddle_test_inference_api.h"
 #include "../test_helper.h"
@@ -31,8 +31,9 @@ void t1() {
   paddle_mobile_gpu.SetCLPath("/data/local/tmp/bin");
 #endif
   auto time1 = paddle_mobile::time();
-  auto isok = paddle_mobile_gpu.Load(std::string(g_yolo_mul) + "/model",
-                                     std::string(g_yolo_mul) + "/params", true);
+  auto isok =
+      paddle_mobile_gpu.Load(std::string(g_yolo_vision) + "/model",
+                             std::string(g_yolo_vision) + "/params", true);
   // auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
 
   if (isok) {
@@ -42,13 +43,13 @@ void t1() {
 
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 416, 416};
 
-    GetInput<float>(g_yolo_img, &input, dims);
+    GetInput<float>(g_test_image_1x3x416x416_vision_yolo_input, &input, dims);
 
    std::vector<float> vec_result;  // = paddle_mobile.Predict(input, dims);
 
    auto time3 = paddle_mobile::time();
-    int max = 10;
+    int max = 1;
     for (int i = 0; i < max; ++i) {
       vec_result = paddle_mobile_gpu.Predict(input, dims);
     }
@@ -129,9 +130,9 @@ void t2() {
 void t3() {
   paddle_mobile::PaddleMobile paddle_mobile;
   // paddle_mobile.SetThreadNum(4);
-  //#ifdef PADDLE_MOBILE_CL
+  // #ifdef PADDLE_MOBILE_CL
   // paddle_mobile.SetCLPath("/data/local/tmp/bin");
-  //#endif
+  // #endif
   auto time1 = paddle_mobile::time();
   auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model",
                                  std::string(g_yolo_mul) + "/params", true);
diff --git a/test/test_helper.h b/test/test_helper.h
index a5000f1971d1fecf31682b49490f17ce487ee330..775a2b8b7b0797ecc637b22539319e8c3e980dae 100644
--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -51,6 +51,8 @@ static const char *g_yolo_combined = "../models/yolo_combined";
 static const char *g_yolo_mul = "../models/d";
 static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
 static const char *g_vgg16_ssd_combined = "../models/vgg16_ssd_combined";
+static const char *g_mobilenet_vision = "../models/vision_mobilenet";
+static const char *g_yolo_vision = "../models/vision_yolo";
 static const char *g_test_image_1x3x224x224 =
     "../images/test_image_1x3x224x224_float";
 static const char *g_test_image_1x3x224x224_banana =
@@ -65,8 +67,12 @@ static const char *g_img = "../images/img.bin";
 static const char *g_yolo_img = "../images/in_put_1_3_416_416_2";
 static const char *g_super_img = "../images/mingren_input_data";
 static const char *g_mobilenet_img = "../images/image";
+static const char *g_test_image_1x3x224x224_vision_mobilenet_input =
+    "../images/vision_mobilenet_input";
+static const char *g_test_image_1x3x416x416_vision_yolo_input =
+    "../images/yolo_input";
 
-using namespace paddle_mobile;
+using namespace paddle_mobile;  // NOLINT
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::LoDTensor;
 using paddle_mobile::framework::Tensor;
diff --git a/tools/op.cmake b/tools/op.cmake
old mode 100644
new mode 100755
index 3bdedc15d8e228d5ce69356de8388a0e28cf4a6a..3b5a98023f8d24986081beea0782c8acc265062b
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -128,10 +128,21 @@ if (CON GREATER -1)
     set(FUSION_CONVADDBN_OP ON)
     set(RESHAPE2_OP ON)
     set(PSROI_POOL_OP ON)
+    set(ROIALIGN_POOL_OP ON)
     set(PROPOSAL_OP ON)
    set(ANCHOR_GENERATOR_OP ON)
     set(SLICE_OP ON)
-
+    set(SIGMOID_OP ON)
+    set(CONCAT_OP ON)
+    set(PAD2D_OP ON)
+    set(CONV_TRANSPOSE_OP ON)
+    set(FUSION_DECONVADDBNRELU_OP ON)
+    set(FUSION_DECONVADDBN_OP ON)
+    set(FUSION_DECONVBNRELU_OP ON)
+    set(CONV_OP ON)
+    set(ELEMENTWISEMUL_OP ON)
+    set(FUSION_FCRELU_OP ON)
+    set(RELU_OP ON)
     set(FOUND_MATCH ON)
 endif()
 
@@ -577,7 +588,6 @@ endif()
 if (FUSION_DECONVADDRELU_OP)
     add_definitions(-DFUSION_DECONVADDRELU_OP)
 endif()
-
 if (WHILE_OP)
     add_definitions(-DWHILE_OP)
 endif()
@@ -603,6 +613,9 @@ endif()
 if (PSROI_POOL_OP)
     add_definitions(-DPSROI_POOL_OP)
 endif()
+if (ROIALIGN_POOL_OP)
+    add_definitions(-DROIALIGN_POOL_OP)
+endif()
 if (ROI_PERSPECTIVE_OP)
     add_definitions(-DROI_PERSPECTIVE_OP)
 endif()
@@ -611,6 +624,14 @@ if (BEAM_SEARCH_OP)
 endif()
 if (BEAM_SEARCH_DECODE_OP)
     add_definitions(-DBEAM_SEARCH_DECODE_OP)
+if (FUSION_DECONVADDBNRELU_OP)
+    add_definitions(-DFUSION_DECONVADDBNRELU_OP)
+endif()
+if (FUSION_DECONVBNRELU_OP)
+    add_definitions(-DFUSION_DECONVBNRELU_OP)
+endif()
+if (FUSION_DECONVADDBN_OP)
+    add_definitions(-DFUSION_DECONVADDBN_OP)
 endif()
 if (PAD2D_OP)
     add_definitions(-DPAD2D_OP)
diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook
index 7f4db9fbfb674fd27319df1f265ddc93140fe412..78ca3cfcdda52a223be609801e6b12ec58b79323 100644
--- a/tools/pre-commit.hooks/cpplint.hook
+++ b/tools/pre-commit.hooks/cpplint.hook
@@ -5,7 +5,7 @@ TOTAL_ERRORS=0
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
     grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
-    grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "dim.h"); do
+    grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
     cpplint $file;
     TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
 done
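Note on the op.cmake changes: each set(<NAME>_OP ON) switch is turned into an add_definitions(-D<NAME>_OP) compile definition, so the corresponding operator code is only compiled when the switch is on for the matched net configuration. A rough, self-contained illustration of how such a guard behaves (hypothetical flag usage, not code from the patch):

#include <iostream>

// Compile with -DROIALIGN_POOL_OP (as op.cmake now arranges) to flip this
// flag; without the definition, the guarded branch is simply not built.
#ifdef ROIALIGN_POOL_OP
static const bool kRoiAlignPoolBuiltIn = true;
#else
static const bool kRoiAlignPoolBuiltIn = false;
#endif

int main() {
  std::cout << "roialign_pool compiled in: " << std::boolalpha
            << kRoiAlignPoolBuiltIn << std::endl;
  return 0;
}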