From bfca5725b5dc5667d26e23804f77c6d6e5cdf3e1 Mon Sep 17 00:00:00 2001 From: Yanzhan Yang Date: Tue, 28 May 2019 17:50:02 +0800 Subject: [PATCH] 1. enable conv mps with texture bias. 2. add scale mps. 3. remove duplicate computation in fetch kernel. 4. fix typo. (#1665) --- .../ConvAddReluMetal.metal | 8 ++--- .../FetchKernel.inc.metal | 20 ++++++------ .../paddle-mobile/API/Runner.swift | 2 +- .../Src/Operators/Base/Operator.swift | 2 +- .../Operators/Kernels/ConvAddReluKernel.swift | 32 ++++++++++++++++--- .../Src/Operators/Kernels/ConvKernel.swift | 4 +-- .../Src/Operators/Kernels/ScaleOpKernel.swift | 24 ++++++++++++++ 7 files changed, 69 insertions(+), 23 deletions(-) diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal index d696e15df7..de85897c10 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal @@ -644,13 +644,13 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array inTexture [[texture(0)]], device float *output [[buffer(0)]], uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= inTexture.get_width() || - gid.y >= inTexture.get_height() || + uint input_width = inTexture.get_width(); + uint input_height = inTexture.get_height(); + if (gid.x >= input_width || + gid.y >= input_height || gid.z >= inTexture.get_array_size()) { return; } - int input_width = inTexture.get_width(); - int input_height = inTexture.get_height(); const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); - int output_to = 4 * input_width * input_height; - - output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x; + uint delta = input_width * input_height; + uint output_to = 4 * gid.z * delta + gid.y * input_width + gid.x; - output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; - output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; - output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; + output[output_to] = input.x; + output[output_to + delta] = input.y; + output[output_to + 2 * delta] = input.z; + output[output_to + 3 * delta] = input.w; } kernel void FUNC(fetch, 1or2, P)(texture2d_array inTexture [[texture(0)]], diff --git a/metal/paddle-mobile/paddle-mobile/API/Runner.swift b/metal/paddle-mobile/paddle-mobile/API/Runner.swift index b4f3c686f9..9a42ba4810 100644 --- a/metal/paddle-mobile/paddle-mobile/API/Runner.swift +++ b/metal/paddle-mobile/paddle-mobile/API/Runner.swift @@ -116,7 +116,7 @@ import Foundation initContext.metalLoadMode = net.metalLoadMode initContext.metalLibPath = net.metalLibPath initContext.useMPS = net.useMPS - initContext.useAggresiveOptimization = net.useAggressiveOptimization + initContext.useAggressiveOptimization = net.useAggressiveOptimization switch net.paramPrecision { case .Float16: diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift index a9eaec5c76..a89dfa3b86 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Base/Operator.swift @@ -71,7 +71,7 @@ public class InitContext { var useMPS: Bool = false /// 是否使用最高等级的加速策略 - var useAggresiveOptimization: Bool = false + var useAggressiveOptimization: Bool = false init() { metalLoadMode = .LoadMetalInDefaultLib diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift index a63075c660..1e385b1f2c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift @@ -108,6 +108,8 @@ class ConvDataSource: NSObject, MPSCNNConvolutionDataSourc class ConvAddReluKernel: Kernel, Computable { var metalParam: MetalConvParam! var mpsConvOp: Any? + var mpsAddOp: Any? + var mpsReluOp: Any? var blankTexture: Texture? required init(device: MTLDevice, param: ConvAddReluParam

, initContext: InitContext) throws { @@ -118,9 +120,11 @@ class ConvAddReluKernel: Kernel, Computable { } var shouldUseMPS = false - let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization) - if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) { - if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { + let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization) + if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) { + let inputChannel = param.input.tensorDim[1] + let outputChannel = param.output.tensorDim[1] + if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { shouldUseMPS = true } } @@ -149,6 +153,15 @@ class ConvAddReluKernel: Kernel, Computable { let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1]) let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1]) conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + if #available(iOS 11.3, *) { + if let add = mpsAddOp as? MPSCNNAdd, let y = param.y { + let biasImage = MPSImage.init(texture: y.metalTexture, featureChannels: y.tensorDim[1]) + add.encode(commandBuffer: commandBuffer, primaryImage: outputImage, secondaryImage: biasImage, destinationImage: outputImage) + } + if let relu = mpsReluOp as? MPSCNNNeuronReLU { + relu.encode(commandBuffer: commandBuffer, sourceImage: outputImage, destinationImage: outputImage) + } + } return } } @@ -172,17 +185,26 @@ class ConvAddReluKernel: Kernel, Computable { if #available(iOS 11.0, *) { param.input.useMPS = true param.output.useMPS = true + if #available(iOS 11.3, *) { + if param.y != nil { + mpsAddOp = MPSCNNAdd(device: device) + if hasReluOp() { + mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0) + } + } + } + let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) let desc: MPSCNNConvolutionDescriptor = isDepthWise ? MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], kernelHeight: param.filter.tensorDim[2], inputFeatureChannels: param.input.tensorDim[1], outputFeatureChannels: param.output.tensorDim[1], - neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) : + neuronFilter: neuronFilter) : MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], kernelHeight: param.filter.tensorDim[2], inputFeatureChannels: param.input.tensorDim[1], outputFeatureChannels: param.output.tensorDim[1], - neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) + neuronFilter: neuronFilter) desc.strideInPixelsX = Int(param.stride[0]) desc.strideInPixelsY = Int(param.stride[1]) let _ = param.filter.convert(converter: MPSPointerConverter

.init()) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift index 2e82aa935e..11f8a2683c 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift @@ -28,8 +28,8 @@ class ConvKernel: Kernel, Computable { } var shouldUseMPS = false - let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization) - if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) { + let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization) + if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) { if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { shouldUseMPS = true } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift index c56bb844ab..5f2f20308d 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift @@ -13,6 +13,7 @@ limitations under the License. */ import Foundation +import MetalPerformanceShaders struct ScaleMetalParam { let scale: Float32 @@ -21,6 +22,8 @@ struct ScaleMetalParam { class ScaleOpKernel: Kernel, Computable{ var metalParam: ScaleMetalParam + var mpsScaleOp: AnyObject? + required init(device: MTLDevice, param: ScaleParam

, initContext: InitContext) throws { do { try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) @@ -28,6 +31,15 @@ class ScaleOpKernel: Kernel, Computable{ throw error } + var shouldUseMPS = false + if initContext.useMPS && param.biasAfterScale { + let inputChannel = param.input.tensorDim[1] + let outputChannel = param.output.tensorDim[1] + if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { + shouldUseMPS = true + } + } + metalParam = ScaleMetalParam(scale: param.scale, abias: param.bias) if GlobalConfig.shared.computePrecision == .Float32 { @@ -45,9 +57,21 @@ class ScaleOpKernel: Kernel, Computable{ } else { fatalError() } + + if #available(iOS 10.0, *), shouldUseMPS { + mpsScaleOp = MPSCNNNeuronLinear(device: device, a: param.scale, b: param.bias) + param.input.useMPS = true + param.output.useMPS = true + } } func compute(commandBuffer: MTLCommandBuffer, param: ScaleParam

) throws { + if #available(iOS 10.0, *), let mpsScaleOp = mpsScaleOp as? MPSCNNNeuronLinear { + let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1]) + let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1]) + mpsScaleOp.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) + return + } guard let encoder = commandBuffer.makeComputeCommandEncoder() else { throw PaddleMobileError.predictError(message: " encoder is nil") } -- GitLab