未验证 提交 6f856926 编写于 作者: R Ruilong Liu 提交者: GitHub

Merge pull request #883 from codeWorm2015/metal

Metal
...@@ -19,10 +19,10 @@ ...@@ -19,10 +19,10 @@
<rect key="frame" x="0.0" y="0.0" width="375" height="667"/> <rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/> <autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<subviews> <subviews>
<imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" ambiguous="YES" image="hand.jpg" translatesAutoresizingMaskIntoConstraints="NO" id="ZZh-fw-LwK"> <imageView userInteractionEnabled="NO" contentMode="scaleAspectFit" horizontalHuggingPriority="251" verticalHuggingPriority="251" translatesAutoresizingMaskIntoConstraints="NO" id="ZZh-fw-LwK">
<rect key="frame" x="0.0" y="20" width="375" height="247"/> <rect key="frame" x="0.0" y="20" width="375" height="247"/>
</imageView> </imageView>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" ambiguous="YES" text="Thread:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L"> <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Thread:" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="2EB-m2-a3L">
<rect key="frame" x="10" y="538" width="68" height="24"/> <rect key="frame" x="10" y="538" width="68" height="24"/>
<constraints> <constraints>
<constraint firstAttribute="width" constant="68" id="Q5J-tq-JSX"/> <constraint firstAttribute="width" constant="68" id="Q5J-tq-JSX"/>
...@@ -32,19 +32,19 @@ ...@@ -32,19 +32,19 @@
<nil key="textColor"/> <nil key="textColor"/>
<nil key="highlightedColor"/> <nil key="highlightedColor"/>
</label> </label>
<pickerView contentMode="scaleToFill" ambiguous="YES" translatesAutoresizingMaskIntoConstraints="NO" id="DlO-dk-RMr"> <pickerView contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="DlO-dk-RMr">
<rect key="frame" x="88" y="510.5" width="287" height="80"/> <rect key="frame" x="88" y="510.5" width="287" height="80"/>
<constraints> <constraints>
<constraint firstAttribute="height" constant="80" id="Sbi-05-Mwd"/> <constraint firstAttribute="height" constant="80" id="Sbi-05-Mwd"/>
</constraints> </constraints>
</pickerView> </pickerView>
<pickerView contentMode="scaleToFill" ambiguous="YES" translatesAutoresizingMaskIntoConstraints="NO" id="6MG-gv-hD5"> <pickerView contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="6MG-gv-hD5">
<rect key="frame" x="85" y="401" width="290" height="80"/> <rect key="frame" x="85" y="401" width="290" height="80"/>
<constraints> <constraints>
<constraint firstAttribute="height" constant="80" id="yAL-JY-G6b"/> <constraint firstAttribute="height" constant="80" id="yAL-JY-G6b"/>
</constraints> </constraints>
</pickerView> </pickerView>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" ambiguous="YES" text="Models" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="avL-VK-Kha"> <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="Models" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="avL-VK-Kha">
<rect key="frame" x="10" y="429" width="65" height="24"/> <rect key="frame" x="10" y="429" width="65" height="24"/>
<constraints> <constraints>
<constraint firstAttribute="width" constant="65" id="6oA-g2-Xq4"/> <constraint firstAttribute="width" constant="65" id="6oA-g2-Xq4"/>
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
<nil key="textColor"/> <nil key="textColor"/>
<nil key="highlightedColor"/> <nil key="highlightedColor"/>
</label> </label>
<button opaque="NO" contentMode="scaleToFill" ambiguous="YES" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="wUL-9N-u1V"> <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="wUL-9N-u1V">
<rect key="frame" x="16" y="597" width="63.5" height="30"/> <rect key="frame" x="16" y="597" width="63.5" height="30"/>
<color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<state key="normal" title="Image"> <state key="normal" title="Image">
...@@ -64,7 +64,7 @@ ...@@ -64,7 +64,7 @@
<action selector="selectImageAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="5uR-SM-fKO"/> <action selector="selectImageAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="5uR-SM-fKO"/>
</connections> </connections>
</button> </button>
<button opaque="NO" contentMode="scaleToFill" ambiguous="YES" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="XpL-9M-UOp"> <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="XpL-9M-UOp">
<rect key="frame" x="109.5" y="597" width="63" height="30"/> <rect key="frame" x="109.5" y="597" width="63" height="30"/>
<color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<state key="normal" title="Load"> <state key="normal" title="Load">
...@@ -74,7 +74,7 @@ ...@@ -74,7 +74,7 @@
<action selector="loadAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="fZ5-CQ-jCY"/> <action selector="loadAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="fZ5-CQ-jCY"/>
</connections> </connections>
</button> </button>
<button opaque="NO" contentMode="scaleToFill" ambiguous="YES" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="R90-Yf-S6g"> <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="R90-Yf-S6g">
<rect key="frame" x="202.5" y="597" width="63.5" height="30"/> <rect key="frame" x="202.5" y="597" width="63.5" height="30"/>
<color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<state key="normal" title="Predict"> <state key="normal" title="Predict">
...@@ -84,7 +84,7 @@ ...@@ -84,7 +84,7 @@
<action selector="predictAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="Iyy-sY-gt4"/> <action selector="predictAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="Iyy-sY-gt4"/>
</connections> </connections>
</button> </button>
<button opaque="NO" contentMode="scaleToFill" ambiguous="YES" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="a3K-ri-NVs"> <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" buttonType="roundedRect" showsTouchWhenHighlighted="YES" lineBreakMode="middleTruncation" translatesAutoresizingMaskIntoConstraints="NO" id="a3K-ri-NVs">
<rect key="frame" x="296" y="597" width="63" height="30"/> <rect key="frame" x="296" y="597" width="63" height="30"/>
<color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="0.0" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<state key="normal" title="Clear"> <state key="normal" title="Clear">
...@@ -94,7 +94,7 @@ ...@@ -94,7 +94,7 @@
<action selector="clearAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="JYf-UX-rCR"/> <action selector="clearAct:" destination="BYZ-38-t0r" eventType="touchUpInside" id="JYf-UX-rCR"/>
</connections> </connections>
</button> </button>
<view contentMode="scaleToFill" ambiguous="YES" translatesAutoresizingMaskIntoConstraints="NO" id="w7H-Sk-Rai"> <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="w7H-Sk-Rai">
<rect key="frame" x="79.5" y="597" width="30" height="30"/> <rect key="frame" x="79.5" y="597" width="30" height="30"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraints> <constraints>
...@@ -102,7 +102,7 @@ ...@@ -102,7 +102,7 @@
<constraint firstAttribute="width" constant="30" id="vYd-Fc-KAj"/> <constraint firstAttribute="width" constant="30" id="vYd-Fc-KAj"/>
</constraints> </constraints>
</view> </view>
<view contentMode="scaleToFill" ambiguous="YES" translatesAutoresizingMaskIntoConstraints="NO" id="T4O-nx-ciH"> <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="T4O-nx-ciH">
<rect key="frame" x="266" y="597" width="30" height="30"/> <rect key="frame" x="266" y="597" width="30" height="30"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraints> <constraints>
...@@ -110,7 +110,7 @@ ...@@ -110,7 +110,7 @@
<constraint firstAttribute="width" constant="30" id="fXE-S7-ZXL"/> <constraint firstAttribute="width" constant="30" id="fXE-S7-ZXL"/>
</constraints> </constraints>
</view> </view>
<view contentMode="scaleToFill" ambiguous="YES" translatesAutoresizingMaskIntoConstraints="NO" id="976-fk-Kx2"> <view contentMode="scaleToFill" translatesAutoresizingMaskIntoConstraints="NO" id="976-fk-Kx2">
<rect key="frame" x="172.5" y="597" width="30" height="30"/> <rect key="frame" x="172.5" y="597" width="30" height="30"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraints> <constraints>
...@@ -118,7 +118,7 @@ ...@@ -118,7 +118,7 @@
<constraint firstAttribute="width" constant="30" id="L4p-hP-s5C"/> <constraint firstAttribute="width" constant="30" id="L4p-hP-s5C"/>
</constraints> </constraints>
</view> </view>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" ambiguous="YES" text="耗时:" lineBreakMode="tailTruncation" numberOfLines="0" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="m5L-O7-P31"> <label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" text="耗时:" lineBreakMode="tailTruncation" numberOfLines="0" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="m5L-O7-P31">
<rect key="frame" x="15" y="277" width="350" height="38"/> <rect key="frame" x="15" y="277" width="350" height="38"/>
<constraints> <constraints>
<constraint firstAttribute="height" constant="38" id="6SS-sb-7I2"/> <constraint firstAttribute="height" constant="38" id="6SS-sb-7I2"/>
...@@ -133,7 +133,7 @@ ...@@ -133,7 +133,7 @@
<constraint firstAttribute="width" secondItem="4ey-Xr-U4e" secondAttribute="height" multiplier="6.5:1" id="8c5-FF-lB9"/> <constraint firstAttribute="width" secondItem="4ey-Xr-U4e" secondAttribute="height" multiplier="6.5:1" id="8c5-FF-lB9"/>
</constraints> </constraints>
</imageView> </imageView>
<textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" ambiguous="YES" editable="NO" text="结果:" textAlignment="natural" translatesAutoresizingMaskIntoConstraints="NO" id="VQn-bS-fWp"> <textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" editable="NO" text="结果:" textAlignment="natural" translatesAutoresizingMaskIntoConstraints="NO" id="VQn-bS-fWp">
<rect key="frame" x="10" y="323" width="355" height="70"/> <rect key="frame" x="10" y="323" width="355" height="70"/>
<color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/> <color key="backgroundColor" white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
<constraints> <constraints>
...@@ -203,7 +203,6 @@ ...@@ -203,7 +203,6 @@
</scene> </scene>
</scenes> </scenes>
<resources> <resources>
<image name="hand.jpg" width="564" height="664"/>
<image name="paddle-mobile.png" width="402" height="62"/> <image name="paddle-mobile.png" width="402" height="62"/>
</resources> </resources>
</document> </document>
...@@ -33,7 +33,7 @@ class MobileNet_ssd_hand: Net{ ...@@ -33,7 +33,7 @@ class MobileNet_ssd_hand: Net{
return " \(res)" return " \(res)"
} }
func fetchResult(paddleMobileRes: ResultHolder<Float32>) -> [Float32] { func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
guard let interRes = paddleMobileRes.intermediateResults else { guard let interRes = paddleMobileRes.intermediateResults else {
fatalError(" need have inter result ") fatalError(" need have inter result ")
...@@ -47,13 +47,17 @@ class MobileNet_ssd_hand: Net{ ...@@ -47,13 +47,17 @@ class MobileNet_ssd_hand: Net{
fatalError() fatalError()
} }
var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.originDim[0], h: score.originDim[1], w: score.originDim[2], c: score.originDim[3])) var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.padToFourDim[0], h: score.padToFourDim[1], w: score.padToFourDim[2], c: score.padToFourDim[3]))
print("score: ")
print(scoreFormatArr.strideArray())
var bboxArr = bbox.metalTexture.float32Array() var bboxArr = bbox.metalTexture.float32Array()
print("bbox: ")
print(bboxArr.strideArray())
let nmsCompute = NMSCompute.init() let nmsCompute = NMSCompute.init()
nmsCompute.scoreThredshold = 0.01 nmsCompute.scoreThredshold = 0.01
nmsCompute.nmsTopK = 200 nmsCompute.nmsTopK = 400
nmsCompute.keepTopK = 200 nmsCompute.keepTopK = 200
nmsCompute.nmsEta = 1.0 nmsCompute.nmsEta = 1.0
nmsCompute.nmsThreshold = 0.45 nmsCompute.nmsThreshold = 0.45
...@@ -68,6 +72,7 @@ class MobileNet_ssd_hand: Net{ ...@@ -68,6 +72,7 @@ class MobileNet_ssd_hand: Net{
let output: [Float32] = result.map { $0.floatValue } let output: [Float32] = result.map { $0.floatValue }
return output return output
} }
......
...@@ -37,7 +37,7 @@ protocol Net { ...@@ -37,7 +37,7 @@ protocol Net {
var preprocessKernel: CusomKernel { get } var preprocessKernel: CusomKernel { get }
func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void) func getTexture(image: CGImage, getTexture: @escaping (MTLTexture) -> Void)
func resultStr(res: [Float]) -> String func resultStr(res: [Float]) -> String
func fetchResult(paddleMobileRes: ResultHolder<Float32>) -> [Float32] func fetchResult(paddleMobileRes: ResultHolder) -> [Float32]
mutating func load() throws mutating func load() throws
func predict(inTexture: MTLTexture, completion: @escaping ((time:TimeInterval, resultArray: [Float32])) -> Void) throws func predict(inTexture: MTLTexture, completion: @escaping ((time:TimeInterval, resultArray: [Float32])) -> Void) throws
...@@ -82,7 +82,7 @@ extension Net { ...@@ -82,7 +82,7 @@ extension Net {
} }
} }
func fetchResult(paddleMobileRes: ResultHolder<Float32>) -> [Float32] { func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] {
return paddleMobileRes.resultArr return paddleMobileRes.resultArr
} }
......
...@@ -19,17 +19,17 @@ import MetalPerformanceShaders ...@@ -19,17 +19,17 @@ import MetalPerformanceShaders
let threadSupport = [1] let threadSupport = [1]
let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()] let modelHelperMap: [SupportModel : Net] = [ .mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()]
//, .genet : Genet.init() //, .genet : Genet.init()
//let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()] //let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()]
enum SupportModel: String{ enum SupportModel: String{
// case mobilenet = "mobilenet" case mobilenet = "mobilenet"
case mobilenet_ssd = "mobilenetssd" case mobilenet_ssd = "mobilenetssd"
case genet = "genet" case genet = "genet"
static func supportedModels() -> [SupportModel] { static func supportedModels() -> [SupportModel] {
//.mobilenet, //
return [.mobilenet_ssd ,.genet] return [.mobilenet, .mobilenet_ssd ,.genet]
} }
} }
...@@ -87,7 +87,7 @@ class ViewController: UIViewController { ...@@ -87,7 +87,7 @@ class ViewController: UIViewController {
fatalError() fatalError()
} }
// print(result.resultArray) print(result.resultArray.strideArray())
if i == max - 1 { if i == max - 1 {
let time = Date.init().timeIntervalSince(startDate) let time = Date.init().timeIntervalSince(startDate)
DispatchQueue.main.async { DispatchQueue.main.async {
......
...@@ -41,7 +41,6 @@ ...@@ -41,7 +41,6 @@
FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; };
FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; };
FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; };
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; };
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; };
FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; };
FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; };
...@@ -133,7 +132,6 @@ ...@@ -133,7 +132,6 @@
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = "<group>"; }; FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = "<group>"; };
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = "<group>"; }; FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = "<group>"; };
FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = "<group>"; }; FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = "<group>"; };
FC1B186520ECF1C600678B91 /* ResizeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ResizeKernel.swift; sourceTree = "<group>"; };
FC27990D21341016000B6BAD /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = "<group>"; }; FC27990D21341016000B6BAD /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = "<group>"; };
FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = "<group>"; }; FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = "<group>"; };
FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; }; FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
...@@ -326,7 +324,6 @@ ...@@ -326,7 +324,6 @@
FCEB6837212F00B100D2448E /* metal */, FCEB6837212F00B100D2448E /* metal */,
FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */,
FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */,
FC1B186520ECF1C600678B91 /* ResizeKernel.swift */,
FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */, FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */,
FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */, FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */,
FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */, FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */,
...@@ -506,7 +503,6 @@ ...@@ -506,7 +503,6 @@
FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */, FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */,
FC9D037920E229E4000F735A /* OpParam.swift in Sources */, FC9D037920E229E4000F735A /* OpParam.swift in Sources */,
FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */,
FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */, FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */,
FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */,
......
...@@ -113,7 +113,7 @@ extension MTLDevice { ...@@ -113,7 +113,7 @@ extension MTLDevice {
return tensor return tensor
} }
func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> MTLTexture { func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3], inComputePrecision: ComputePrecision = .Float32) -> MTLTexture {
if value.count > 0 { if value.count > 0 {
assert(value.count == dim.reduce(1) { $0 * $1 }) assert(value.count == dim.reduce(1) { $0 * $1 })
} }
...@@ -129,7 +129,13 @@ extension MTLDevice { ...@@ -129,7 +129,13 @@ extension MTLDevice {
textureDesc.height = ndim[1] textureDesc.height = ndim[1]
textureDesc.depth = 1 textureDesc.depth = 1
textureDesc.usage = [.shaderRead, .shaderWrite] textureDesc.usage = [.shaderRead, .shaderWrite]
textureDesc.pixelFormat = .rgba32Float
if inComputePrecision == .Float16 {
textureDesc.pixelFormat = .rgba16Float
} else if inComputePrecision == .Float32 {
textureDesc.pixelFormat = .rgba32Float
}
textureDesc.textureType = .type2DArray textureDesc.textureType = .type2DArray
textureDesc.storageMode = .shared textureDesc.storageMode = .shared
textureDesc.cpuCacheMode = .defaultCache textureDesc.cpuCacheMode = .defaultCache
...@@ -354,13 +360,8 @@ public extension MTLTexture { ...@@ -354,13 +360,8 @@ public extension MTLTexture {
} }
// n c h w - dim // n c h w - dim
func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
// print("origin dim: \(dim)")
print("texture: ")
print(self)
var textureArray: [Float32] var textureArray: [Float32]
// if texturePrecision == .Float16
if pixelFormat == .rgba32Float { if pixelFormat == .rgba32Float {
textureArray = floatArray { (i : Float32) -> Float32 in textureArray = floatArray { (i : Float32) -> Float32 in
return i return i
...@@ -388,11 +389,10 @@ public extension MTLTexture { ...@@ -388,11 +389,10 @@ public extension MTLTexture {
} }
} }
} }
print(" tensor count -- \(output.count)")
return output return output
} }
func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] {
// print("origin dim: \(dim)") // print("origin dim: \(dim)")
// print("texture: ") // print("texture: ")
// print(self) // print(self)
......
...@@ -14,18 +14,18 @@ ...@@ -14,18 +14,18 @@
import Foundation import Foundation
let testTo = 61 let testTo = 161
var isTest = false var isTest = false
let computePrecision: ComputePrecision = .Float32 let computePrecision: ComputePrecision = .Float16
public class ResultHolder<P: PrecisionType> { public class ResultHolder {
public let dim: [Int] public let dim: [Int]
public let resultArr: [P] public let resultArr: [Float32]
public var intermediateResults: [String : [Variant]]? public var intermediateResults: [String : [Variant]]?
public let elapsedTime: Double public let elapsedTime: Double
public init(inDim: [Int], inResult: [P], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) { public init(inDim: [Int], inResult: [Float32], inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
dim = inDim dim = inDim
resultArr = inResult resultArr = inResult
elapsedTime = inElapsedTime elapsedTime = inElapsedTime
...@@ -78,7 +78,7 @@ public class Executor<P: PrecisionType> { ...@@ -78,7 +78,7 @@ public class Executor<P: PrecisionType> {
} }
} }
public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder<P>) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws { public func predict(input: MTLTexture, dim: [Int], completionHandle: @escaping (ResultHolder) -> Void, preProcessKernle: CusomKernel? = nil, except: Int = 0) throws {
guard let buffer = queue.makeCommandBuffer() else { guard let buffer = queue.makeCommandBuffer() else {
throw PaddleMobileError.predictError(message: "CommandBuffer is nil") throw PaddleMobileError.predictError(message: "CommandBuffer is nil")
} }
...@@ -114,12 +114,10 @@ public class Executor<P: PrecisionType> { ...@@ -114,12 +114,10 @@ public class Executor<P: PrecisionType> {
buffer.addCompletedHandler { (commandbuffer) in buffer.addCompletedHandler { (commandbuffer) in
// let inputArr = resInput.floatArray(res: { (p:P) -> P in // let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
// return p //// print(inputArr.strideArray())
// })
// print(inputArr.strideArray())
// //
// writeToLibrary(fileName: "genet_input_hand", array: inputArr) // writeToLibrary(fileName: "test_image_ssd", array: inputArr)
// print("write to library done") // print("write to library done")
// return // return
// print(inputArr) // print(inputArr)
...@@ -133,23 +131,23 @@ public class Executor<P: PrecisionType> { ...@@ -133,23 +131,23 @@ public class Executor<P: PrecisionType> {
print(" 第 \(i) 个 op: ") print(" 第 \(i) 个 op: ")
op.delogOutput() op.delogOutput()
} }
// self.ops[59].delogOutput()
// return;
// self.ops[testTo - 2].delogOutput()
// self.ops[testTo - 1].delogOutput()
// self.ops[60].delogOutput() // self.ops[60].delogOutput()
return // return
let afterDate = Date.init() let afterDate = Date.init()
var resultHolder: ResultHolder
var resultHolder: ResultHolder<P>
if except > 0 { if except > 0 {
resultHolder = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures) resultHolder = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
} else { } else {
let outputVar: Variant = self.program.scope.output()! let outputVar: Variant = self.program.scope.output()!
let output: Texture<P> = outputVar as! Texture<P> let output: Texture<P> = outputVar as! Texture<P>
resultHolder = ResultHolder<P>.init(inDim: output.dim.dims, inResult: output.metalTexture.floatArray(res: { (p:P) -> P in resultHolder = ResultHolder.init(inDim: output.dim.dims, inResult: output.toTensor(), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
return p
}), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
} }
completionHandle(resultHolder) completionHandle(resultHolder)
......
...@@ -159,7 +159,7 @@ public class Loader<P: PrecisionType> { ...@@ -159,7 +159,7 @@ public class Loader<P: PrecisionType> {
} catch let error { } catch let error {
throw error throw error
} }
tensor.convert(to: DataLayout.NHWC()) // tensor.convert(to: DataLayout.NHWC())
// tensor.initBuffer(device: device) // tensor.initBuffer(device: device)
scope[varDesc.name] = tensor scope[varDesc.name] = tensor
} else { } else {
...@@ -168,7 +168,7 @@ public class Loader<P: PrecisionType> { ...@@ -168,7 +168,7 @@ public class Loader<P: PrecisionType> {
} }
} else { } else {
if varDesc.name == fetchKey { if varDesc.name == fetchKey {
scope[varDesc.name] = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: 0.0) scope[varDesc.name] = ResultHolder.init(inDim: [], inResult: [], inElapsedTime: 0.0)
} else if varDesc.name == feedKey { } else if varDesc.name == feedKey {
} }
} }
......
...@@ -59,28 +59,28 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P> ...@@ -59,28 +59,28 @@ class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
// let priorBoxOriginDim = para.priorBox.originDim // let priorBoxpadToFourDim = para.priorBox.padToFourDim
// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3])) // let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxpadToFourDim[0], h: priorBoxpadToFourDim[1], w: priorBoxpadToFourDim[2], c: priorBoxpadToFourDim[3]))
// print(" prior box ") // print(" prior box ")
// print(priorBoxArray.strideArray()) // print(priorBoxArray.strideArray())
// //
// let priorBoxVarOriginDim = para.priorBoxVar.originDim // let priorBoxVarpadToFourDim = para.priorBoxVar.padToFourDim
// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3])) // let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarpadToFourDim[0], h: priorBoxVarpadToFourDim[1], w: priorBoxVarpadToFourDim[2], c: priorBoxVarpadToFourDim[3]))
// print(" prior box var ") // print(" prior box var ")
// print(priorBoxVarArray.strideArray()) // print(priorBoxVarArray.strideArray())
// //
// let targetBoxOriginDim = para.targetBox.originDim // let targetBoxpadToFourDim = para.targetBox.padToFourDim
// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3])) // let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
// print(" target box ") // print(" target box ")
// print(targetBoxArray.strideArray()) // print(targetBoxArray.strideArray())
let targetBoxOriginDim = para.targetBox.originDim let targetBoxpadToFourDim = para.targetBox.padToFourDim
let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]), texturePrecision: computePrecision) let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxpadToFourDim[0], h: targetBoxpadToFourDim[1], w: targetBoxpadToFourDim[2], c: targetBoxpadToFourDim[3]))
print(" target box ") print(" target box ")
print(targetBoxArray.strideArray()) print(targetBoxArray.strideArray())
let originDim = para.output.originDim let padToFourDim = para.output.padToFourDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(" output ") print(" output ")
print(outputArray.strideArray()) print(outputArray.strideArray())
} }
......
...@@ -65,12 +65,12 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run ...@@ -65,12 +65,12 @@ class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Run
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3])).strideArray())
} else { } else {
fatalError(" not implemet") fatalError(" not implemet")
} }
......
...@@ -34,7 +34,7 @@ class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam { ...@@ -34,7 +34,7 @@ class ConvAddBatchNormReluParam<P: PrecisionType>: OpParam {
scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope) scale = try ConvAddBatchNormReluParam.inputScale(inputs: opDesc.paraInputs, from: inScope)
mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope) mean = try ConvAddBatchNormReluParam.inputMean(inputs: opDesc.paraInputs, from: inScope)
y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.inputs, from: inScope) y = try ConvAddBatchNormReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
} catch let error { } catch let error {
throw error throw error
} }
...@@ -112,7 +112,7 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer ...@@ -112,7 +112,7 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
func delogOutput() { func delogOutput() {
print(" conv add batchnorm relu output ") print(" conv add batchnorm relu output ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) print(para.output.toTensor().strideArray())
// let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false) // let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
// para.filter.logDataPointer(header: "filter data pointer: ") // para.filter.logDataPointer(header: "filter data pointer: ")
// print("filter: \(para.filter)") // print("filter: \(para.filter)")
......
...@@ -93,24 +93,24 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>, ...@@ -93,24 +93,24 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
} }
func delogOutput() { func delogOutput() {
// print("op \(type): ")
// print(" padding: ")
// print(para.paddings)
// print("stride: ")
// print(para.stride)
// print("dilations: ")
// print(para.dilations)
// print(" para input dim: ")
// print(para.input.dim)
// print(" para filter dim: ")
// print(para.filter.dim)
// print(" para output dim: ")
// print(para.output.dim)
// print(" biase: ")
// let biase: [Float32] = para.y.buffer.array()
// print(biase)
print(" padding: ")
print(para.paddings)
print("stride: ")
print(para.stride)
print("dilations: ")
print(para.dilations)
print(" \(type) output: ") print(" \(type) output: ")
print(" para input dim: ") print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
print(para.input.dim)
print(" para filter dim: ")
print(para.filter.dim)
print(" para output dim: ")
print(para.output.dim)
print(" biase: ")
let biase: [Float32] = para.y.buffer.array()
print(biase)
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray())
} }
} }
...@@ -110,7 +110,7 @@ class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluPa ...@@ -110,7 +110,7 @@ class ConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNReluPa
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
} }
} }
...@@ -75,7 +75,7 @@ class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable, ...@@ -75,7 +75,7 @@ class ConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runable,
func delogOutput() { func delogOutput() {
print("conv output : ") print("conv output : ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) print(para.output.toTensor().strideArray())
// let _: Float16? = para.output.metalTexture.logDesc() // let _: Float16? = para.output.metalTexture.logDesc()
} }
} }
...@@ -43,13 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr ...@@ -43,13 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
} }
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]))
print(output.strideArray())
} else { } else {
print(" not implement") print(" not implement")
} }
......
...@@ -58,6 +58,6 @@ class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runa ...@@ -58,6 +58,6 @@ class DepthConvOp<P: PrecisionType>: Operator<ConvKernel<P>, ConvParam<P>>, Runa
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
} }
} }
...@@ -65,6 +65,6 @@ class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNRelu ...@@ -65,6 +65,6 @@ class DwConvBNReluOp<P: PrecisionType>: Operator<ConvBNReluKernel<P>, ConvBNRelu
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
} }
} }
...@@ -71,12 +71,15 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem ...@@ -71,12 +71,15 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
// print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) // print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim
print(para.inputY)
let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} else { } else {
print(" not implement") print(" not implement")
} }
......
...@@ -61,7 +61,7 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam< ...@@ -61,7 +61,7 @@ class FeedOp<P: PrecisionType>: Operator<Texture2DTo2DArrayKernel<P>, FeedParam<
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
} }
} }
...@@ -15,53 +15,60 @@ ...@@ -15,53 +15,60 @@
import Foundation import Foundation
class BatchNormKernel<P: PrecisionType>: Kernel, Computable { class BatchNormKernel<P: PrecisionType>: Kernel, Computable {
var newScale: MTLBuffer var newScale: MTLBuffer
var newBias: MTLBuffer var newBias: MTLBuffer
required init(device: MTLDevice, param: BatchNormParam<P>) {
guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else {
fatalError()
}
guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
fatalError()
}
self.newScale = newScale
self.newBias = newBias
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "batchnorm")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "batchnorm_half")
} else {
fatalError()
}
let varianceBuffer : MTLBuffer = param.inputVariance.buffer
required init(device: MTLDevice, param: BatchNormParam<P>) { var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
fatalError() for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
} invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else {
fatalError()
}
self.newScale = newScale
self.newBias = newBias
super.init(device: device, inFunctionName: "batchnorm")
let varianceBuffer : MTLBuffer = param.inputVariance.buffer
var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length)
let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<(varianceBuffer.length / MemoryLayout<P>.stride) {
invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot()
}
let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
let scale : MTLBuffer = param.inputScale.buffer
let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
let bias : MTLBuffer = param.inputBias.buffer
let biasContents = bias.contents().assumingMemoryBound(to: P.self)
let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws { let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self)
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self)
throw PaddleMobileError.predictError(message: " encoder is nil") let scale : MTLBuffer = param.inputScale.buffer
} let scaleContents = scale.contents().assumingMemoryBound(to: P.self)
print("BatchNorm compute") let bias : MTLBuffer = param.inputBias.buffer
encoder.setTexture(param.input.metalTexture, index: 0) let biasContents = bias.contents().assumingMemoryBound(to: P.self)
encoder.setTexture(param.output.metalTexture, index: 1) let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self)
encoder.setBuffer(newScale, offset: 0, index: 0)
encoder.setBuffer(newBias, offset: 0, index: 1) for i in 0..<(newScale.length / MemoryLayout<P>.stride) {
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i]))
encoder.endEncoding() newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i]))
}
}
func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
} }
print("BatchNorm compute")
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBuffer(newScale, offset: 0, index: 0)
encoder.setBuffer(newBias, offset: 0, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
} }
...@@ -49,26 +49,37 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable ...@@ -49,26 +49,37 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) { required init(device: MTLDevice, param: ConvAddBatchNormReluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
}
param.filter.initBuffer(device: device, precision: computePrecision) param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision) param.y.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32) param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32) param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32) param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32) param.bias.initBuffer(device: device, precision: .Float32)
if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
} else {
fatalError(" unsupport ")
}
} else if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half")
} else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half")
} else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half")
} else {
fatalError(" unsupport ")
}
} else {
fatalError()
}
let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1]) let offsetY = param.filter.height/2 - Int(param.paddings[1])
...@@ -108,10 +119,10 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable ...@@ -108,10 +119,10 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
var newBiaseBuffer: MTLBuffer var newBiaseBuffer: MTLBuffer
var newScaleBuffer: MTLBuffer var newScaleBuffer: MTLBuffer
if computePrecision == .Float16 { if computePrecision == .Float32 {
newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)!
newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)!
} else if computePrecision == .Float32 { } else if computePrecision == .Float16 {
newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)!
...@@ -138,7 +149,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable ...@@ -138,7 +149,6 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
throw PaddleMobileError.predictError(message: " encode is nil") throw PaddleMobileError.predictError(message: " encode is nil")
} }
encoder.setTexture(param.input.metalTexture, index: 0) encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1) encoder.setTexture(param.output.metalTexture, index: 1)
encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0) encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
......
...@@ -17,14 +17,23 @@ import Foundation ...@@ -17,14 +17,23 @@ import Foundation
class ConvAddKernel<P: PrecisionType>: Kernel, Computable { class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvAddParam<P>) { required init(device: MTLDevice, param: ConvAddParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
if computePrecision == .Float16 { if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x1_half") super.init(device: device, inFunctionName: "conv_add_1x1_half")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3_half") super.init(device: device, inFunctionName: "conv_add_3x3_half")
} else if param.filter.width == 1 && param.filter.height == 5 {
super.init(device: device, inFunctionName: "conv_add_5x1_half")
} else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5_half")
} else {
fatalError(" unsupport yet ")
} }
} else if computePrecision == .Float32 { } else if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
...@@ -35,22 +44,21 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -35,22 +44,21 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
super.init(device: device, inFunctionName: "conv_add_5x1") super.init(device: device, inFunctionName: "conv_add_5x1")
} else if param.filter.width == 5 && param.filter.height == 1 { } else if param.filter.width == 5 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_add_1x5") super.init(device: device, inFunctionName: "conv_add_1x5")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_add_3x3") super.init(device: device, inFunctionName: "conv_add_3x3")
} else {
fatalError(" unsupport yet ")
} }
} else { } else {
fatalError() fatalError()
} }
let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.y.initBuffer(device: device, precision: computePrecision)
print(" function: \(functionName)") print(" function: \(functionName)")
print("offset x: \(offsetX)") print("offset x: \(offsetX)")
print("offset y: \(offsetY)") print("offset y: \(offsetY)")
......
...@@ -49,35 +49,41 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable { ...@@ -49,35 +49,41 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
} }
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvBNReluParam<P>) { required init(device: MTLDevice, param: ConvBNReluParam<P>) {
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32)
if computePrecision == .Float32 { if computePrecision == .Float32 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
} else {
fatalError(" unsupport ")
} }
} else if computePrecision == .Float16 { } else if computePrecision == .Float16 {
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half")
} else {
fatalError(" unsupport ")
} }
} else { } else {
fatalError() fatalError()
} }
param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision)
param.variance.initBuffer(device: device, precision: .Float32)
param.mean.initBuffer(device: device, precision: .Float32)
param.scale.initBuffer(device: device, precision: .Float32)
param.bias.initBuffer(device: device, precision: .Float32)
let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetX = param.filter.width/2 - Int(param.paddings[0])
let offsetY = param.filter.height/2 - Int(param.paddings[1]) let offsetY = param.filter.height/2 - Int(param.paddings[1])
......
...@@ -27,18 +27,20 @@ public struct MetalConvParam { ...@@ -27,18 +27,20 @@ public struct MetalConvParam {
class ConvKernel<P: PrecisionType>: Kernel, Computable { class ConvKernel<P: PrecisionType>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
required init(device: MTLDevice, param: ConvParam<P>) { required init(device: MTLDevice, param: ConvParam<P>) {
param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
if param.filter.width == 1 && param.filter.height == 1 { if param.filter.width == 1 && param.filter.height == 1 {
super.init(device: device, inFunctionName: "conv_1x1") super.init(device: device, inFunctionName: "conv_1x1")
} else if param.filter.channel == 1 { } else if param.filter.channel == 1 {
super.init(device: device, inFunctionName: "depthwise_conv_3x3") super.init(device: device, inFunctionName: "depthwise_conv_3x3")
} else { } else if param.filter.width == 3 && param.filter.height == 3 {
super.init(device: device, inFunctionName: "conv_3x3") super.init(device: device, inFunctionName: "conv_3x3")
} else {
fatalError(" unsupport ")
} }
let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
let offsetZ = 0.0 let offsetZ = 0.0
param.filter.initBuffer(device: device, precision: ComputePrecision.Float32)
metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
} }
......
...@@ -31,7 +31,27 @@ struct MetalConvTransposeParam { ...@@ -31,7 +31,27 @@ struct MetalConvTransposeParam {
class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: MetalConvTransposeParam! var metalParam: MetalConvTransposeParam!
required init(device: MTLDevice, param: ConvTransposeParam<P>) { required init(device: MTLDevice, param: ConvTransposeParam<P>) {
super.init(device: device, inFunctionName: "conv_transpose") param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true)
if computePrecision == .Float32 {
if param.stride == [2, 2] && param.stride == [2, 2] {
super.init(device: device, inFunctionName: "conv_transpose2x2_stride2")
} else {
fatalError(" -- conv transpose unsupported yet -- ")
}
} else if computePrecision == .Float16 {
if param.stride == [2, 2] && param.stride == [2, 2] {
super.init(device: device, inFunctionName: "conv_transpose2x2_stride2_half")
} else {
fatalError(" -- conv transpose unsupported yet -- ")
}
} else {
fatalError()
}
// let filter: [Float32] = param.filter.buffer.array()
// print(" conv transpose filter")
// print(filter)
let kernelWidth = UInt16(param.filter.width) let kernelWidth = UInt16(param.filter.width)
let kernelHeight = UInt16(param.filter.height) let kernelHeight = UInt16(param.filter.height)
...@@ -43,9 +63,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -43,9 +63,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
let dilationY = UInt16(param.dilations[1]) let dilationY = UInt16(param.dilations[1])
metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY) metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
param.output.initTexture(device: device, inTranspose: param.input.transpose)
param.filter.initBuffer(device: device)
} }
func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import Foundation import Foundation
struct ElementwiseAddMetalParam { struct ElementwiseAddMetalParam {
var unsafe_one_dim: Int32 = 0
var fast: Int32 = 0 var fast: Int32 = 0
var axis: Int32 = 0 var axis: Int32 = 0
var yoff: Int32 = 0 var yoff: Int32 = 0
...@@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam { ...@@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam {
class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable { class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
required init(device: MTLDevice, param: ElementwiseAddParam<P>) { required init(device: MTLDevice, param: ElementwiseAddParam<P>) {
super.init(device: device, inFunctionName: "elementwise_add")
param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "elementwise_add")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "elementwise_add_half")
} else {
fatalError()
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam<P>) throws {
...@@ -59,6 +66,11 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable { ...@@ -59,6 +66,11 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
emp.fast = 1 emp.fast = 1
} }
// TODO:
if param.inputY.tensorDim.cout() == 1 {
emp.unsafe_one_dim = 1;
}
encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0) encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding() encoder.endEncoding()
......
...@@ -27,8 +27,14 @@ struct PoolMetalParam { ...@@ -27,8 +27,14 @@ struct PoolMetalParam {
class PoolKernel<P: PrecisionType>: Kernel, Computable{ class PoolKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: PoolParam<P>) { required init(device: MTLDevice, param: PoolParam<P>) {
super.init(device: device, inFunctionName: "pool")
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "pool")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "pool_half")
} else {
fatalError()
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: PoolParam<P>) throws {
......
...@@ -10,15 +10,27 @@ import Foundation ...@@ -10,15 +10,27 @@ import Foundation
class PreluKernel<P: PrecisionType>: Kernel, Computable{ class PreluKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: PreluParam<P>) { required init(device: MTLDevice, param: PreluParam<P>) {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element")
} else {
super.init(device: device, inFunctionName: "prelu_other")
}
param.alpha.initBuffer(device: device, precision: computePrecision) param.alpha.initBuffer(device: device, precision: computePrecision)
param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
if computePrecision == .Float32 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element")
} else {
super.init(device: device, inFunctionName: "prelu_other")
}
} else if computePrecision == .Float16 {
if param.mode == "channel" {
super.init(device: device, inFunctionName: "prelu_channel_half")
} else if param.mode == "element" {
super.init(device: device, inFunctionName: "prelu_element_half")
} else {
super.init(device: device, inFunctionName: "prelu_other_half")
}
} else {
fatalError()
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: PreluParam<P>) throws {
......
...@@ -33,6 +33,10 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -33,6 +33,10 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
var metalParam: PriorBoxMetalParam! var metalParam: PriorBoxMetalParam!
required init(device: MTLDevice, param: PriorBoxParam<P>) { required init(device: MTLDevice, param: PriorBoxParam<P>) {
param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
if computePrecision == .Float32 { if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "prior_box") super.init(device: device, inFunctionName: "prior_box")
} else if computePrecision == .Float16 { } else if computePrecision == .Float16 {
...@@ -41,9 +45,6 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -41,9 +45,6 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
fatalError() fatalError()
} }
param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision)
let n = 1 let n = 1
let h = param.output.dim[1] let h = param.output.dim[1]
let w = param.output.dim[2] let w = param.output.dim[2]
...@@ -52,11 +53,11 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -52,11 +53,11 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
param.output.dim = Dim.init(inDim: [n, h, w, c]) param.output.dim = Dim.init(inDim: [n, h, w, c])
param.output.transpose = [0, 1, 2, 3] param.output.transpose = [0, 1, 2, 3]
let imageWidth = Float32(param.inputImage.originDim[3]) let imageWidth = Float32(param.inputImage.padToFourDim[3])
let imageHeight = Float32(param.inputImage.originDim[2]) let imageHeight = Float32(param.inputImage.padToFourDim[2])
let featureWidth = param.input.originDim[3] let featureWidth = param.input.padToFourDim[3]
let featureHeight = param.input.originDim[2] let featureHeight = param.input.padToFourDim[2]
if param.stepW == 0 || param.stepH == 0 { if param.stepW == 0 || param.stepH == 0 {
param.stepW = Float32(imageWidth) / Float32(featureWidth) param.stepW = Float32(imageWidth) / Float32(featureWidth)
......
...@@ -15,17 +15,23 @@ ...@@ -15,17 +15,23 @@
import Foundation import Foundation
class ReluKernel<P: PrecisionType>: Kernel, Computable{ class ReluKernel<P: PrecisionType>: Kernel, Computable{
func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ReluParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encode is nil") throw PaddleMobileError.predictError(message: " encode is nil")
}
encoder.setTexture(param.input.metalTexture, index: 0)
encoder.setTexture(param.output.metalTexture, index: 1)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
} }
encoder.setTexture(param.input.metalTexture, index: 0)
required init(device: MTLDevice, param: ReluParam<P>) { encoder.setTexture(param.output.metalTexture, index: 1)
super.init(device: device, inFunctionName: "relu") encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding()
}
required init(device: MTLDevice, param: ReluParam<P>) {
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "relu")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "relu_half")
} else {
fatalError()
} }
}
} }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
//
//import Foundation
//import MetalPerformanceShaders
//
//
//struct ResizeParam: OpParam{
// typealias OutputType = <#type#>
//
// typealias ParamPrecisionType = <#type#>
//
// let input: MTLTexture
// let output: MTLTexture
// let expectDim: Dim
//}
//
//struct OutputDim {
// let width: UInt16
// let height: UInt16
// let strideX: UInt16
// let strideY: UInt16
//}
//
//class ResizeKernel<P: PrecisionType>: Kernel, Computable{
// var lanczos: MPSImageLanczosScale
// required init(device: MTLDevice, param: ResizeParam) {
// lanczos = MPSImageLanczosScale.init(device: device)
// super.init(device: device, inFunctionName: "resize")
// }
// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws {
//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
//// throw PaddleMobileError.predictError(message: " encode is nil")
//// }
// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output)
//
//// encoder.setTexture(param.input, index: 0)
//// encoder.setTexture(param.output, index: 1)
//// let strideX = param.input.width/param.expectDim[2]
//// let strideY = param.input.height/param.expectDim[1]
//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY))
//// encoder.setBytes(&outputDim, length: MemoryLayout<OutputDim>.size, index: 0)
//// encoder.dispatch(computePipline: pipline, outTexture: param.output)
//// encoder.endEncoding()
// }
//
//
//
//
//}
...@@ -21,6 +21,17 @@ struct SoftmaxMetalParam { ...@@ -21,6 +21,17 @@ struct SoftmaxMetalParam {
class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
required init(device: MTLDevice, param: SoftmaxParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "softmax")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "softmax_half")
} else {
fatalError()
}
}
func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: SoftmaxParam<P>) throws {
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil") throw PaddleMobileError.predictError(message: " encoder is nil")
...@@ -32,19 +43,12 @@ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{ ...@@ -32,19 +43,12 @@ class SoftmaxKernel<P: PrecisionType>: Kernel, Computable{
N: Int32(param.input.tensorDim[0]), N: Int32(param.input.tensorDim[0]),
K: Int32(param.input.tensorDim[1]) K: Int32(param.input.tensorDim[1])
) )
print(" soft max param: ")
print(smp)
encoder.setBytes(&smp, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0) encoder.setBytes(&smp, length: MemoryLayout<SoftmaxMetalParam>.size, index: 0)
encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
encoder.endEncoding() encoder.endEncoding()
} }
required init(device: MTLDevice, param: SoftmaxParam<P>) {
param.output.initTexture(device: device, computePrecision: computePrecision)
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "softmax")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "softmax_half")
} else {
fatalError()
}
}
} }
...@@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in ...@@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
} }
kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
ushort dilation_y = param.dilationY;
half4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i);
for (int j = 0; j < 5; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
output = output + float4(biase[gid.z]);
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
const device half4 *biase [[buffer(2)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 5;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
ushort dilation_x = param.dilationX;
half4 input[5];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
for (int j = 0; j < 5; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(input[j], weight_x);
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(input[j], weight_y);
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(input[j], weight_z);
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(input[j], weight_w);
}
}
output = output + float4(biase[gid.z]);
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
...@@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [ ...@@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array<float, access::sample> inTexture [
// output = output + biase[gid.z]; // output = output + biase[gid.z];
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
...@@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture( ...@@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(
} }
kernel void conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input[9];
for (uint i = 0; i < input_arr_size; ++i) {
input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i);
input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i);
input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i);
input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i);
input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i);
input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i);
input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i);
input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i);
for (int j = 0; j < 9; ++j) {
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.x += dot(float4(input[j]), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.y += dot(float4(input[j]), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.z += dot(float4(input[j]), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
output.w += dot(float4(input[j]), float4(weight_w));
}
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void depthwise_conv_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
uint output_slice = gid.z;
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 9;
uint weithTo = gid.z * kernelHXW * 4;
float4 output = float4(0.0);
half4 inputs[9];
inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice);
inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice);
inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice);
inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice);
inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice);
inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice);
inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice);
for (int j = 0; j < 9; ++j) {
half4 input = inputs[j];
output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
}
outTexture.write(half4(output), gid.xy, gid.z);
}
kernel void conv_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
ushort2 stride = ushort2(param.strideX, param.strideY);
ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
const uint kernelHXW = 1;
uint input_arr_size = inTexture.get_array_size();
uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
float4 output = float4(0.0);
half4 input;
for (uint i = 0; i < input_arr_size; ++i) {
input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i];
output.x += dot(float4(input), float4(weight_x));
half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i];
output.y += dot(float4(input), float4(weight_y));
half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i];
output.z += dot(float4(input), float4(weight_z));
half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
output.w += dot(float4(input), float4(weight_w));
}
outTexture.write(half4(output), gid.xy, gid.z);
}
...@@ -29,11 +29,11 @@ struct MetalConvTransposeParam{ ...@@ -29,11 +29,11 @@ struct MetalConvTransposeParam{
ushort dilationY; ushort dilationY;
}; };
kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]], kernel void conv_transpose2x2_stride2(texture2d_array<float, access::sample> inTexture [[texture(0)]],
texture2d_array<float, access::write> outTexture [[texture(1)]], texture2d_array<float, access::write> outTexture [[texture(1)]],
constant MetalConvTransposeParam &param [[buffer(0)]], constant MetalConvTransposeParam &param [[buffer(0)]],
const device float4 *weights [[buffer(1)]], const device float4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]){ uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() || if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) { gid.z >= outTexture.get_array_size()) {
...@@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[te ...@@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[te
} }
int input_array_size = inTexture.get_array_size(); int input_array_size = inTexture.get_array_size();
int kernel_index_x = gid.x % 2;
uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; int kernel_index_y = gid.y % 2;
int kernel_index = kernel_index_y * 2 + kernel_index_x;
uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
int input_x = gid.x / 2;
int input_y = gid.y / 2;
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
float4 output = float4(0.0);
for (int i = 0; i < input_array_size; ++i) {
float4 input = inTexture.sample(sample, float2(input_x, input_y), i);
float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
output.x += dot(input, kernel_slice0);
output.y += dot(input, kernel_slice1);
output.z += dot(input, kernel_slice2);
output.w += dot(input, kernel_slice3);
}
outTexture.write(output, gid.xy, gid.z);
}
kernel void conv_transpose2x2_stride2_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
constant MetalConvTransposeParam &param [[buffer(0)]],
const device half4 *weights [[buffer(1)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
float4 output; int input_array_size = inTexture.get_array_size();
int kernel_index_x = gid.x % 2;
int kernel_index_y = gid.y % 2;
int kernel_index = kernel_index_y * 2 + kernel_index_x;
int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size);
int input_x = gid.x / 2;
int input_y = gid.y / 2;
for (int w = 0; w < param.kernelW; ++w) { constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX; float4 output = float4(0.0);
if (input_x < 0 || input_x >= int(inTexture.get_width())) { for (int i = 0; i < input_array_size; ++i) {
continue;
}
for (int h = 0; h < param.kernelH; ++h) { half4 input = inTexture.sample(sample, float2(input_x, input_y), i);
int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY;
if (input_y < 0 || input_y >= int(inTexture.get_height())) { half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i];
continue; half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i];
} half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i];
half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i];
uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
output.x += dot(float4(input), float4(kernel_slice0));
for (int slice = 0; slice < input_array_size; ++slice) {
output.y += dot(float4(input), float4(kernel_slice1));
float4 input;
float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; output.z += dot(float4(input), float4(kernel_slice2));
float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
output.w += dot(float4(input), float4(kernel_slice3));
float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
input = inTexture.sample(sample, float2(input_x, input_x), slice);
output.x += dot(input, kernel_slice);
output.x += dot(input, kernel_slice1);
output.x += dot(input, kernel_slice2);
output.x += dot(input, kernel_slice3);
}
}
} }
outTexture.write(output, gid.xy, gid.z); outTexture.write(half4(output), gid.xy, gid.z);
} }
//kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
// texture2d_array<float, access::write> outTexture [[texture(1)]],
// constant MetalConvTransposeParam &param [[buffer(0)]],
// const device float4 *weights [[buffer(1)]],
// uint3 gid [[thread_position_in_grid]]){
// if (gid.x >= outTexture.get_width() ||
// gid.y >= outTexture.get_height() ||
// gid.z >= outTexture.get_array_size()) {
// return;
// }
//
// int input_array_size = inTexture.get_array_size();
//
// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
//
// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
//
// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
//
// float4 output;
//
// for (int w = 0; w < param.kernelW; ++w) {
// int top = gid.x - w * param.dilationX + param.paddingX;
// int input_x = top / param.strideX;
// if (top < 0 || input_x >= int(inTexture.get_width())) {
// continue;
// }
//
// for (int h = 0; h < param.kernelH; ++h) {
// int top_y = gid.y - h * param.dilationY + param.paddingY;
// int input_y = top_y / param.strideY;
// if (top_y < 0 || input_y >= int(inTexture.get_height())) {
// continue;
// }
//
// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
//
// for (int slice = 0; slice < input_array_size; ++slice) {
//
// float4 input;
// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
//
// float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
//
// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
//
// input = inTexture.sample(sample, float2(input_x, input_y), slice);
// output.x += dot(input, kernel_slice);
// output.y += dot(input, kernel_slice1);
// output.z += dot(input, kernel_slice2);
// output.w += dot(input, kernel_slice3);
// }
// }
// }
//
// outTexture.write(output, gid.xy, gid.z);
//}
//
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
using namespace metal; using namespace metal;
struct ElementwiseAddParam { struct ElementwiseAddParam {
int32_t unsafe_one_dim;
int32_t fast; int32_t fast;
int32_t axis; int32_t axis;
int32_t yoff; int32_t yoff;
...@@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur ...@@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
gid.y >= outTexture.get_height() || gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return; gid.z >= outTexture.get_array_size()) return;
float4 rx, ry; float4 rx, ry;
if (pm.fast == 1) { if (pm.unsafe_one_dim == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(uint2(0, 0), gid.z);
} else if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z); rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z);
} else { } else {
...@@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur ...@@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array<float, access::read> inputX [[textur
float4 r = rx + ry; float4 r = rx + ry;
outTexture.write(r, gid.xy, gid.z); outTexture.write(r, gid.xy, gid.z);
} }
kernel void elementwise_add_half(texture2d_array<half, access::read> inputX [[texture(0)]],
texture2d_array<half, access::read> inputY [[texture(1)]],
texture2d_array<half, access::write> outTexture [[texture(2)]],
constant ElementwiseAddParam &pm [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) return;
half4 rx, ry;
if (pm.unsafe_one_dim == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(uint2(0, 0), gid.z);
} else if (pm.fast == 1) {
rx = inputX.read(gid.xy, gid.z);
ry = inputY.read(gid.xy, gid.z);
} else {
rx = inputX.read(gid.xy, gid.z);
int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4];
int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
for (int n = 0; n < 4; n++) {
xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
invtrans(xtrans, x_abcd, t_abcd);
for (int k = pm.axis; k < (4 - pm.yoff); k++) {
y_abcd[k+pm.yoff] = t_abcd[k];
}
trans(ytrans, y_abcd, t_abcd);
abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
}
}
half4 r = rx + ry;
outTexture.write(r, gid.xy, gid.z);
}
...@@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[textu ...@@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array<float, access::sample> inTexture [[textu
outTexture.write(output, gid.xy, gid.z); outTexture.write(output, gid.xy, gid.z);
} }
kernel void prelu_channel_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
half4 alpha_value = alpha[gid.z];
half4 output;
output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_element_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half4 *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size();
half4 alpha_value = alpha[alpha_to + gid.z];
half4 output;
output.x = input.x > 0 ? input.x : (alpha_value.x * input.x);
output.y = input.y > 0 ? input.y : (alpha_value.y * input.y);
output.z = input.z > 0 ? input.z : (alpha_value.z * input.z);
output.w = input.w > 0 ? input.w : (alpha_value.w * input.w);
outTexture.write(output, gid.xy, gid.z);
}
kernel void prelu_other_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
texture2d_array<half, access::write> outTexture [[texture(1)]],
const device half *alpha [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]){
if (gid.x >= outTexture.get_width() ||
gid.y >= outTexture.get_height() ||
gid.z >= outTexture.get_array_size()) {
return;
}
constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z);
half alpha_value = alpha[0];
half4 output;
output.x = input.x > 0 ? input.x : (alpha_value * input.x);
output.y = input.y > 0 ? input.y : (alpha_value * input.y);
output.z = input.z > 0 ? input.z : (alpha_value * input.z);
output.w = input.w > 0 ? input.w : (alpha_value * input.w);
outTexture.write(output, gid.xy, gid.z);
}
...@@ -60,7 +60,7 @@ class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable, ...@@ -60,7 +60,7 @@ class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable,
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
// print("pool2d delog") // print("pool2d delog")
......
...@@ -51,13 +51,13 @@ class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runabl ...@@ -51,13 +51,13 @@ class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runabl
func delogOutput() { func delogOutput() {
print(" \(type) input: ") print(" \(type) input: ")
print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.input.metalTexture.toTensor(dim: (n: para.input.padToFourDim[0], c: para.input.padToFourDim[1], h: para.input.padToFourDim[2], w: para.input.padToFourDim[3])).strideArray())
print(" \(type) Alpha: ") print(" \(type) Alpha: ")
let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false)
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[1], h: para.output.padToFourDim[2], w: para.output.padToFourDim[3])).strideArray())
} }
// print("softmax delog") // print("softmax delog")
......
...@@ -76,12 +76,12 @@ class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P> ...@@ -76,12 +76,12 @@ class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>
print(outputArray) print(outputArray)
// output // output
// print(" \(type) output: ") // print(" \(type) output: ")
// let originDim = para.output.originDim // let padToFourDim = para.output.padToFourDim
// if para.output.transpose == [0, 1, 2, 3] { // if para.output.transpose == [0, 1, 2, 3] {
// let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) // let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]), texturePrecision: computePrecision)
// print(outputArray.strideArray()) // print(outputArray.strideArray())
// } else if para.output.transpose == [0, 2, 3, 1] { // } else if para.output.transpose == [0, 2, 3, 1] {
// print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) // print(para.output.metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]), texturePrecision: computePrecision).strideArray())
// } else { // } else {
// print(" not implement") // print(" not implement")
// } // }
......
...@@ -46,7 +46,7 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable, ...@@ -46,7 +46,7 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
} }
} }
......
...@@ -41,8 +41,8 @@ class ReshapeParam<P: PrecisionType>: OpParam { ...@@ -41,8 +41,8 @@ class ReshapeParam<P: PrecisionType>: OpParam {
for i in 0..<s.count { for i in 0..<s.count {
dim[4-s.count+i] = s[i] dim[4-s.count+i] = s[i]
} }
output.originDim = Dim.init(inDim: dim) output.padToFourDim = Dim.init(inDim: dim)
output.dim = output.originDim output.dim = output.padToFourDim
inplace = try ReshapeParam.getAttr(key: "inplace", attrs: opDesc.attrs) inplace = try ReshapeParam.getAttr(key: "inplace", attrs: opDesc.attrs)
} catch let error { } catch let error {
...@@ -74,9 +74,9 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>, ...@@ -74,9 +74,9 @@ class ReshapeOp<P: PrecisionType>: Operator<ReshapeKernel<P>, ReshapeParam<P>>,
print("reshape delog") print("reshape delog")
// let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false) // let _: P? = para.input.metalTexture.logDesc(header: "reshape input: ", stridable: false)
let originDim = para.output.originDim let padToFourDim = para.output.padToFourDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} }
......
...@@ -26,7 +26,7 @@ class SoftmaxParam<P: PrecisionType>: OpParam { ...@@ -26,7 +26,7 @@ class SoftmaxParam<P: PrecisionType>: OpParam {
output.dim = input.dim output.dim = input.dim
output.tensorDim = input.tensorDim output.tensorDim = input.tensorDim
output.originDim = input.originDim output.padToFourDim = input.padToFourDim
} catch let error { } catch let error {
throw error throw error
} }
...@@ -52,9 +52,11 @@ class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>, ...@@ -52,9 +52,11 @@ class SoftmaxOp<P: PrecisionType>: Operator<SoftmaxKernel<P>, SoftmaxParam<P>>,
func delogOutput() { func delogOutput() {
print("softmax delog") print("softmax delog")
print(para.input)
let originDim = para.output.originDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(para.output)
let padToFourDim = para.output.padToFourDim
let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} }
} }
...@@ -48,9 +48,9 @@ class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam ...@@ -48,9 +48,9 @@ class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam
func delogOutput() { func delogOutput() {
print(" \(type) output: ") print(" \(type) output: ")
let originDim = para.output.originDim let padToFourDim = para.output.padToFourDim
if para.output.transpose == [0, 1, 2, 3] { if para.output.transpose == [0, 1, 2, 3] {
let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) let outputArray = para.output.metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
print(outputArray.strideArray()) print(outputArray.strideArray())
} else if para.output.transpose == [0, 2, 3, 1] { } else if para.output.transpose == [0, 2, 3, 1] {
print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
......
...@@ -95,7 +95,28 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -95,7 +95,28 @@ class Tensor<P: PrecisionType>: Tensorial {
func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) { func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) {
if convertToNHWC {
// print(layout)
convert(to: DataLayout.NHWC())
}
if withTranspose {
let transposePointer = UnsafeMutablePointer<P>.allocate(capacity: numel())
let n = dim[0]
let hwc = numel()/n
for j in 0..<hwc {
for i in 0..<n {
//data[i * hwc + j]
transposePointer[j * n + i] = data[i * hwc + j]
}
}
dim.swapeDimAt(index1: 0, index2: 3)
data.release()
data.pointer = transposePointer
}
guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else { guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
fatalError(" not support yet ") fatalError(" not support yet ")
} }
...@@ -139,6 +160,8 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -139,6 +160,8 @@ class Tensor<P: PrecisionType>: Tensorial {
for j in 0..<paddedC { for j in 0..<paddedC {
if j < C { if j < C {
dstPtr[j] = tmpPointer[j] dstPtr[j] = tmpPointer[j]
} else {
dstPtr[j] = 0
} }
} }
tmpPointer += C tmpPointer += C
...@@ -152,6 +175,47 @@ class Tensor<P: PrecisionType>: Tensorial { ...@@ -152,6 +175,47 @@ class Tensor<P: PrecisionType>: Tensorial {
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
} }
convertedPointer.deinitialize(count: count)
convertedPointer.deallocate()
}
} else {
let C = dim[3]
let cSlices = (C + 3) / 4
let paddedC = cSlices * 4
let count = paddedC * dim[0] * dim[1] * dim[2]
if C == paddedC {
buffer = device.makeBuffer(length: count * precisionSize)
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
}
} else if C == 1 {
fatalError(" not support ")
} else {
buffer = device.makeBuffer(length: count * precisionSize)
let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
var tmpPointer = floatPointer
var dstPtr = convertedPointer
for _ in 0..<dim[0] * dim[1] * dim[2] {
for j in 0..<paddedC {
if j < C {
dstPtr[j] = tmpPointer[j]
} else {
dstPtr[j] = 0
}
}
tmpPointer += C
dstPtr += paddedC
}
switch precision {
case .Float32:
buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
case .Float16:
float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
}
convertedPointer.deinitialize(count: count) convertedPointer.deinitialize(count: count)
convertedPointer.deallocate() convertedPointer.deallocate()
} }
......
...@@ -41,14 +41,28 @@ extension InputTexture { ...@@ -41,14 +41,28 @@ extension InputTexture {
public class Texture<P: PrecisionType>: Tensorial { public class Texture<P: PrecisionType>: Tensorial {
var dim: Dim var dim: Dim
public var tensorDim: Dim public var tensorDim: Dim
public var originDim: Dim public var padToFourDim: Dim
private var textureDesc: MTLTextureDescriptor! private var textureDesc: MTLTextureDescriptor!
public var metalTexture: MTLTexture! public var metalTexture: MTLTexture!
var transpose: [Int] = [0, 1, 2, 3] var transpose: [Int] = [0, 1, 2, 3]
func toTensor() -> [Float32] {
guard padToFourDim.cout() == 4 else {
fatalError("- not support -")
}
return metalTexture.toTensor(dim: (n: padToFourDim[0], c: padToFourDim[1], h: padToFourDim[2], w: padToFourDim[3]))
}
func realNHWC() -> [Float32] {
guard padToFourDim.cout() == 4 else {
fatalError(" - not support - ")
}
return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
}
func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
transpose = inTranspose transpose = inTranspose
let newDim = transpose.map { originDim[$0] } let newDim = transpose.map { padToFourDim[$0] }
let newLayout = transpose.map { layout.layoutWithDim[$0] } let newLayout = transpose.map { layout.layoutWithDim[$0] }
...@@ -93,7 +107,7 @@ public class Texture<P: PrecisionType>: Tensorial { ...@@ -93,7 +107,7 @@ public class Texture<P: PrecisionType>: Tensorial {
} }
tensorDim = inDim tensorDim = inDim
dim = fourDim dim = fourDim
originDim = fourDim padToFourDim = fourDim
layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])]) layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册