未验证 提交 bfca5725 作者: Yanzhan Yang 提交者: GitHub

Enable conv MPS with texture bias, add scale MPS, deduplicate fetch kernel computation, fix typo

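1. Enable the MPS convolution path when the bias is supplied as a texture. 2. Add an MPS implementation of the scale op. 3. Remove duplicate index computation in the fetch kernel. 4. Fix the typo `useAggresiveOptimization` -> `useAggressiveOptimization`. (#1665)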
上级 5fe89118
...@@ -644,13 +644,13 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce ...@@ -644,13 +644,13 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
} }
if (param.hasAddOp == 1) { if (param.hasAddOp == 1) {
half4 base = biasTexture.sample(sample, uint2(tx, ty), tc); half4 base = biasTexture.sample(sample, float2(tx, ty), tc);
res[0] += base; res[0] += base;
base = biasTexture.sample(sample, uint2(tx + 1, ty), tc); base = biasTexture.sample(sample, float2(tx + 1, ty), tc);
res[1] += base; res[1] += base;
base = biasTexture.sample(sample, uint2(tx, ty + 1), tc); base = biasTexture.sample(sample, float2(tx, ty + 1), tc);
res[2] += base; res[2] += base;
base = biasTexture.sample(sample, uint2(tx + 1, ty + 1), tc); base = biasTexture.sample(sample, float2(tx + 1, ty + 1), tc);
res[3] += base; res[3] += base;
} }
......
...@@ -25,22 +25,22 @@ ...@@ -25,22 +25,22 @@
kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]], kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]], device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) { uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() || uint input_width = inTexture.get_width();
gid.y >= inTexture.get_height() || uint input_height = inTexture.get_height();
if (gid.x >= input_width ||
gid.y >= input_height ||
gid.z >= inTexture.get_array_size()) { gid.z >= inTexture.get_array_size()) {
return; return;
} }
int input_width = inTexture.get_width();
int input_height = inTexture.get_height();
const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z); const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
int output_to = 4 * input_width * input_height; uint delta = input_width * input_height;
uint output_to = 4 * gid.z * delta + gid.y * input_width + gid.x;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y; output[output_to] = input.x;
output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z; output[output_to + delta] = input.y;
output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w; output[output_to + 2 * delta] = input.z;
output[output_to + 3 * delta] = input.w;
} }
kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]], kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
......
...@@ -116,7 +116,7 @@ import Foundation ...@@ -116,7 +116,7 @@ import Foundation
initContext.metalLoadMode = net.metalLoadMode initContext.metalLoadMode = net.metalLoadMode
initContext.metalLibPath = net.metalLibPath initContext.metalLibPath = net.metalLibPath
initContext.useMPS = net.useMPS initContext.useMPS = net.useMPS
initContext.useAggresiveOptimization = net.useAggressiveOptimization initContext.useAggressiveOptimization = net.useAggressiveOptimization
switch net.paramPrecision { switch net.paramPrecision {
case .Float16: case .Float16:
......
...@@ -71,7 +71,7 @@ public class InitContext { ...@@ -71,7 +71,7 @@ public class InitContext {
var useMPS: Bool = false var useMPS: Bool = false
/// 是否使用最高等级的加速策略 /// 是否使用最高等级的加速策略
var useAggresiveOptimization: Bool = false var useAggressiveOptimization: Bool = false
init() { init() {
metalLoadMode = .LoadMetalInDefaultLib metalLoadMode = .LoadMetalInDefaultLib
......
...@@ -108,6 +108,8 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc ...@@ -108,6 +108,8 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam! var metalParam: MetalConvParam!
var mpsConvOp: Any? var mpsConvOp: Any?
var mpsAddOp: Any?
var mpsReluOp: Any?
var blankTexture: Texture? var blankTexture: Texture?
required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws { required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
...@@ -118,9 +120,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -118,9 +120,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
} }
var shouldUseMPS = false var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization) let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) { if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
shouldUseMPS = true shouldUseMPS = true
} }
} }
...@@ -149,6 +153,15 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -149,6 +153,15 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1]) let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1]) let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage) conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
if #available(iOS 11.3, *) {
if let add = mpsAddOp as? MPSCNNAdd, let y = param.y {
let biasImage = MPSImage.init(texture: y.metalTexture, featureChannels: y.tensorDim[1])
add.encode(commandBuffer: commandBuffer, primaryImage: outputImage, secondaryImage: biasImage, destinationImage: outputImage)
}
if let relu = mpsReluOp as? MPSCNNNeuronReLU {
relu.encode(commandBuffer: commandBuffer, sourceImage: outputImage, destinationImage: outputImage)
}
}
return return
} }
} }
...@@ -172,17 +185,26 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -172,17 +185,26 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if #available(iOS 11.0, *) { if #available(iOS 11.0, *) {
param.input.useMPS = true param.input.useMPS = true
param.output.useMPS = true param.output.useMPS = true
if #available(iOS 11.3, *) {
if param.y != nil {
mpsAddOp = MPSCNNAdd(device: device)
if hasReluOp() {
mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
}
}
}
let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
let desc: MPSCNNConvolutionDescriptor = isDepthWise ? let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2], kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1], inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1], outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) : neuronFilter: neuronFilter) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3], MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2], kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1], inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1], outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) neuronFilter: neuronFilter)
desc.strideInPixelsX = Int(param.stride[0]) desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1]) desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init()) let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
......
...@@ -28,8 +28,8 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable { ...@@ -28,8 +28,8 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
} }
var shouldUseMPS = false var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization) let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) { if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 { if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true shouldUseMPS = true
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
limitations under the License. */ limitations under the License. */
import Foundation import Foundation
import MetalPerformanceShaders
struct ScaleMetalParam { struct ScaleMetalParam {
let scale: Float32 let scale: Float32
...@@ -21,6 +22,8 @@ struct ScaleMetalParam { ...@@ -21,6 +22,8 @@ struct ScaleMetalParam {
class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
var metalParam: ScaleMetalParam var metalParam: ScaleMetalParam
var mpsScaleOp: AnyObject?
required init(device: MTLDevice, param: ScaleParam<P>, initContext: InitContext) throws { required init(device: MTLDevice, param: ScaleParam<P>, initContext: InitContext) throws {
do { do {
try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
...@@ -28,6 +31,15 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{ ...@@ -28,6 +31,15 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
throw error throw error
} }
var shouldUseMPS = false
if initContext.useMPS && param.biasAfterScale {
let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
shouldUseMPS = true
}
}
metalParam = ScaleMetalParam(scale: param.scale, abias: param.bias) metalParam = ScaleMetalParam(scale: param.scale, abias: param.bias)
if GlobalConfig.shared.computePrecision == .Float32 { if GlobalConfig.shared.computePrecision == .Float32 {
...@@ -45,9 +57,21 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{ ...@@ -45,9 +57,21 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
} else { } else {
fatalError() fatalError()
} }
if #available(iOS 10.0, *), shouldUseMPS {
mpsScaleOp = MPSCNNNeuronLinear(device: device, a: param.scale, b: param.bias)
param.input.useMPS = true
param.output.useMPS = true
}
} }
func compute(commandBuffer: MTLCommandBuffer, param: ScaleParam<P>) throws { func compute(commandBuffer: MTLCommandBuffer, param: ScaleParam<P>) throws {
if #available(iOS 10.0, *), let mpsScaleOp = mpsScaleOp as? MPSCNNNeuronLinear {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
mpsScaleOp.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else { guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil") throw PaddleMobileError.predictError(message: " encoder is nil")
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册