From bfca5725b5dc5667d26e23804f77c6d6e5cdf3e1 Mon Sep 17 00:00:00 2001
From: Yanzhan Yang <yangyanzhan@gmail.com>
Date: Tue, 28 May 2019 17:50:02 +0800
Subject: [PATCH] 1. Enable MPS conv with texture bias. 2. Add MPS scale op. 3.
 Remove duplicate computation in fetch kernel. 4. Fix typo
 (useAggresiveOptimization -> useAggressiveOptimization). (#1665)

---
 .../ConvAddReluMetal.metal                    |  8 ++---
 .../FetchKernel.inc.metal                     | 20 ++++++------
 .../paddle-mobile/API/Runner.swift            |  2 +-
 .../Src/Operators/Base/Operator.swift         |  2 +-
 .../Operators/Kernels/ConvAddReluKernel.swift | 32 ++++++++++++++++---
 .../Src/Operators/Kernels/ConvKernel.swift    |  4 +--
 .../Src/Operators/Kernels/ScaleOpKernel.swift | 24 ++++++++++++++
 7 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
index d696e15df7..de85897c10 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
@@ -644,13 +644,13 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
     }
     
     if (param.hasAddOp == 1) {
-        half4 base = biasTexture.sample(sample, uint2(tx, ty), tc);
+        half4 base = biasTexture.sample(sample, float2(tx, ty), tc);
         res[0] += base;
-        base = biasTexture.sample(sample, uint2(tx + 1, ty), tc);
+        base = biasTexture.sample(sample, float2(tx + 1, ty), tc);
         res[1] += base;
-        base = biasTexture.sample(sample, uint2(tx, ty + 1), tc);
+        base = biasTexture.sample(sample, float2(tx, ty + 1), tc);
         res[2] += base;
-        base = biasTexture.sample(sample, uint2(tx + 1, ty + 1), tc);
+        base = biasTexture.sample(sample, float2(tx + 1, ty + 1), tc);
         res[3] += base;
     }
 
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
index 0a3f9b8fdd..efeec0298b 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/FetchKernel.inc.metal
@@ -25,22 +25,22 @@
 kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
                              device float *output [[buffer(0)]],
                              uint3 gid [[thread_position_in_grid]]) {
-    if (gid.x >= inTexture.get_width() ||
-        gid.y >= inTexture.get_height() ||
+    uint input_width = inTexture.get_width();
+    uint input_height = inTexture.get_height();
+    if (gid.x >= input_width ||
+        gid.y >= input_height ||
         gid.z >= inTexture.get_array_size()) {
         return;
     }
     
-    int input_width = inTexture.get_width();
-    int input_height = inTexture.get_height();
     const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
-    int output_to = 4 * input_width * input_height;
-    
-    output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
+    uint delta = input_width * input_height;
+    uint output_to = 4 * gid.z * delta + gid.y * input_width + gid.x;
     
-    output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
-    output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
-    output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
+    output[output_to] = input.x;
+    output[output_to + delta] = input.y;
+    output[output_to + 2 * delta] = input.z;
+    output[output_to + 3 * delta] = input.w;
 }
 
 kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
diff --git a/metal/paddle-mobile/paddle-mobile/API/Runner.swift b/metal/paddle-mobile/paddle-mobile/API/Runner.swift
index b4f3c686f9..9a42ba4810 100644
--- a/metal/paddle-mobile/paddle-mobile/API/Runner.swift
+++ b/metal/paddle-mobile/paddle-mobile/API/Runner.swift
@@ -116,7 +116,7 @@ import Foundation
             initContext.metalLoadMode = net.metalLoadMode
             initContext.metalLibPath = net.metalLibPath
             initContext.useMPS = net.useMPS
-            initContext.useAggresiveOptimization = net.useAggressiveOptimization
+            initContext.useAggressiveOptimization = net.useAggressiveOptimization
 
             switch net.paramPrecision {
             case .Float16:
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
index a9eaec5c76..a89dfa3b86 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift
@@ -71,7 +71,7 @@ public class InitContext {
     var useMPS: Bool = false
     
     /// 是否使用最高等级的加速策略
-    var useAggresiveOptimization: Bool = false
+    var useAggressiveOptimization: Bool = false
     
     init() {
         metalLoadMode = .LoadMetalInDefaultLib
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
index a63075c660..1e385b1f2c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
@@ -108,6 +108,8 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
 class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
     var metalParam: MetalConvParam!
     var mpsConvOp: Any?
+    var mpsAddOp: Any?
+    var mpsReluOp: Any?
     var blankTexture: Texture?
     
     required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
@@ -118,9 +120,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         }
         
         var shouldUseMPS = false
-        let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
-        if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
-            if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
+        let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization)
+        if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
+            let inputChannel = param.input.tensorDim[1]
+            let outputChannel = param.output.tensorDim[1]
+            if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
                 shouldUseMPS = true
             }
         }
@@ -149,6 +153,15 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
                 let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
                 let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
                 conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
+                if #available(iOS 11.3, *) {
+                    if let add = mpsAddOp as? MPSCNNAdd, let y = param.y {
+                        let biasImage = MPSImage.init(texture: y.metalTexture, featureChannels: y.tensorDim[1])
+                        add.encode(commandBuffer: commandBuffer, primaryImage: outputImage, secondaryImage: biasImage, destinationImage: outputImage)
+                    }
+                    if let relu = mpsReluOp as? MPSCNNNeuronReLU {
+                        relu.encode(commandBuffer: commandBuffer, sourceImage: outputImage, destinationImage: outputImage)
+                    }
+                }
                 return
             }
         }
@@ -172,17 +185,26 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         if #available(iOS 11.0, *) {
             param.input.useMPS = true
             param.output.useMPS = true
+            if #available(iOS 11.3, *) {
+                if param.y != nil {
+                    mpsAddOp = MPSCNNAdd(device: device)
+                    if hasReluOp() {
+                        mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
+                    }
+                }
+            }
+            let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
             let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
                 MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                                      kernelHeight: param.filter.tensorDim[2],
                                                      inputFeatureChannels: param.input.tensorDim[1],
                                                      outputFeatureChannels: param.output.tensorDim[1],
-                                                     neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
+                                                     neuronFilter: neuronFilter) :
                 MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                             kernelHeight: param.filter.tensorDim[2],
                                             inputFeatureChannels: param.input.tensorDim[1],
                                             outputFeatureChannels: param.output.tensorDim[1],
-                                            neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
+                                            neuronFilter: neuronFilter)
             desc.strideInPixelsX = Int(param.stride[0])
             desc.strideInPixelsY = Int(param.stride[1])
             let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
index 2e82aa935e..11f8a2683c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
@@ -28,8 +28,8 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
         }
         
         var shouldUseMPS = false
-        let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
-        if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
+        let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization)
+        if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
             if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
                 shouldUseMPS = true
             }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
index c56bb844ab..5f2f20308d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
@@ -13,6 +13,7 @@
  limitations under the License. */
 
 import Foundation
+import MetalPerformanceShaders
 
 struct ScaleMetalParam {
     let scale: Float32
@@ -21,6 +22,8 @@ struct ScaleMetalParam {
 
 class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
     var metalParam: ScaleMetalParam
+    var mpsScaleOp: AnyObject?
+    
     required init(device: MTLDevice, param: ScaleParam<P>, initContext: InitContext) throws {
         do {
             try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
@@ -28,6 +31,15 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
             throw error
         }
         
+        var shouldUseMPS = false
+        if initContext.useMPS && param.biasAfterScale {
+            let inputChannel = param.input.tensorDim[1]
+            let outputChannel = param.output.tensorDim[1]
+            if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
+                shouldUseMPS = true
+            }
+        }
+        
         metalParam = ScaleMetalParam(scale: param.scale, abias: param.bias)
         
         if GlobalConfig.shared.computePrecision == .Float32 {
@@ -45,9 +57,21 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
         } else {
             fatalError()
         }
+        
+        if #available(iOS 10.0, *), shouldUseMPS {
+            mpsScaleOp = MPSCNNNeuronLinear(device: device, a: param.scale, b: param.bias)
+            param.input.useMPS = true
+            param.output.useMPS = true
+        }
     }
     
     func compute(commandBuffer: MTLCommandBuffer, param: ScaleParam<P>) throws {
+        if #available(iOS 10.0, *), let mpsScaleOp = mpsScaleOp as? MPSCNNNeuronLinear {
+            let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
+            let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
+            mpsScaleOp.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
+            return
+        }
         guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
             throw PaddleMobileError.predictError(message: " encoder is nil")
         }
-- 
GitLab