提交 fcfeac88 编写于 作者: Y Yanzhan Yang 提交者: GitHub

1. enable conv mps with texture bias. 2. add scale mps. 3. remove duplicate...

1. enable conv mps with texture bias. 2. add scale mps. 3. remove duplicate computation in fetch kernel. 4. fix typo. (#1665)
上级 c1ffeb30
......@@ -644,13 +644,13 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, acce
}
if (param.hasAddOp == 1) {
half4 base = biasTexture.sample(sample, uint2(tx, ty), tc);
half4 base = biasTexture.sample(sample, float2(tx, ty), tc);
res[0] += base;
base = biasTexture.sample(sample, uint2(tx + 1, ty), tc);
base = biasTexture.sample(sample, float2(tx + 1, ty), tc);
res[1] += base;
base = biasTexture.sample(sample, uint2(tx, ty + 1), tc);
base = biasTexture.sample(sample, float2(tx, ty + 1), tc);
res[2] += base;
base = biasTexture.sample(sample, uint2(tx + 1, ty + 1), tc);
base = biasTexture.sample(sample, float2(tx + 1, ty + 1), tc);
res[3] += base;
}
......
......@@ -25,22 +25,22 @@
kernel void FUNC_T(fetch, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
device float *output [[buffer(0)]],
uint3 gid [[thread_position_in_grid]]) {
if (gid.x >= inTexture.get_width() ||
gid.y >= inTexture.get_height() ||
uint input_width = inTexture.get_width();
uint input_height = inTexture.get_height();
if (gid.x >= input_width ||
gid.y >= input_height ||
gid.z >= inTexture.get_array_size()) {
return;
}
int input_width = inTexture.get_width();
int input_height = inTexture.get_height();
const VECTOR(P, 4) input = inTexture.read(gid.xy, gid.z);
int output_to = 4 * input_width * input_height;
output[gid.z * output_to + 0 * input_width * input_height + gid.y * input_width + gid.x] = input.x;
uint delta = input_width * input_height;
uint output_to = 4 * gid.z * delta + gid.y * input_width + gid.x;
output[gid.z * output_to + 1 * input_width * input_height + gid.y * input_width + gid.x] = input.y;
output[gid.z * output_to + 2 * input_width * input_height + gid.y * input_width + gid.x] = input.z;
output[gid.z * output_to + 3 * input_width * input_height + gid.y * input_width + gid.x] = input.w;
output[output_to] = input.x;
output[output_to + delta] = input.y;
output[output_to + 2 * delta] = input.z;
output[output_to + 3 * delta] = input.w;
}
kernel void FUNC(fetch, 1or2, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
......
......@@ -116,7 +116,7 @@ import Foundation
initContext.metalLoadMode = net.metalLoadMode
initContext.metalLibPath = net.metalLibPath
initContext.useMPS = net.useMPS
initContext.useAggresiveOptimization = net.useAggressiveOptimization
initContext.useAggressiveOptimization = net.useAggressiveOptimization
switch net.paramPrecision {
case .Float16:
......
......@@ -71,7 +71,7 @@ public class InitContext {
var useMPS: Bool = false
/// Whether to use the most aggressive (highest-level) acceleration strategy
var useAggresiveOptimization: Bool = false
var useAggressiveOptimization: Bool = false
init() {
metalLoadMode = .LoadMetalInDefaultLib
......
......@@ -108,6 +108,8 @@ class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSourc
class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
var metalParam: MetalConvParam!
var mpsConvOp: Any?
var mpsAddOp: Any?
var mpsReluOp: Any?
var blankTexture: Texture?
required init(device: MTLDevice, param: ConvAddReluParam<P>, initContext: InitContext) throws {
......@@ -118,9 +120,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
}
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
shouldUseMPS = true
}
}
......@@ -149,6 +153,15 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
conv.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
if #available(iOS 11.3, *) {
if let add = mpsAddOp as? MPSCNNAdd, let y = param.y {
let biasImage = MPSImage.init(texture: y.metalTexture, featureChannels: y.tensorDim[1])
add.encode(commandBuffer: commandBuffer, primaryImage: outputImage, secondaryImage: biasImage, destinationImage: outputImage)
}
if let relu = mpsReluOp as? MPSCNNNeuronReLU {
relu.encode(commandBuffer: commandBuffer, sourceImage: outputImage, destinationImage: outputImage)
}
}
return
}
}
......@@ -172,17 +185,26 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
if #available(iOS 11.0, *) {
param.input.useMPS = true
param.output.useMPS = true
if #available(iOS 11.3, *) {
if param.y != nil {
mpsAddOp = MPSCNNAdd(device: device)
if hasReluOp() {
mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
}
}
}
let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
let desc: MPSCNNConvolutionDescriptor = isDepthWise ?
MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) :
neuronFilter: neuronFilter) :
MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
kernelHeight: param.filter.tensorDim[2],
inputFeatureChannels: param.input.tensorDim[1],
outputFeatureChannels: param.output.tensorDim[1],
neuronFilter: neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
neuronFilter: neuronFilter)
desc.strideInPixelsX = Int(param.stride[0])
desc.strideInPixelsY = Int(param.stride[1])
let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
......
......@@ -28,8 +28,8 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
}
var shouldUseMPS = false
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggresiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggresiveOptimization) {
let functionName = type(of: self).kernelFunctionName(param: param, useAggressiveOptimization: initContext.useAggressiveOptimization)
if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
shouldUseMPS = true
}
......
......@@ -13,6 +13,7 @@
limitations under the License. */
import Foundation
import MetalPerformanceShaders
struct ScaleMetalParam {
let scale: Float32
......@@ -21,6 +22,8 @@ struct ScaleMetalParam {
class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
var metalParam: ScaleMetalParam
var mpsScaleOp: AnyObject?
required init(device: MTLDevice, param: ScaleParam<P>, initContext: InitContext) throws {
do {
try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
......@@ -28,6 +31,15 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
throw error
}
var shouldUseMPS = false
if initContext.useMPS && param.biasAfterScale {
let inputChannel = param.input.tensorDim[1]
let outputChannel = param.output.tensorDim[1]
if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
shouldUseMPS = true
}
}
metalParam = ScaleMetalParam(scale: param.scale, abias: param.bias)
if GlobalConfig.shared.computePrecision == .Float32 {
......@@ -45,9 +57,21 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
} else {
fatalError()
}
if #available(iOS 10.0, *), shouldUseMPS {
mpsScaleOp = MPSCNNNeuronLinear(device: device, a: param.scale, b: param.bias)
param.input.useMPS = true
param.output.useMPS = true
}
}
func compute(commandBuffer: MTLCommandBuffer, param: ScaleParam<P>) throws {
if #available(iOS 10.0, *), let mpsScaleOp = mpsScaleOp as? MPSCNNNeuronLinear {
let inputImage = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
let outputImage = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
mpsScaleOp.encode(commandBuffer: commandBuffer, sourceImage: inputImage, destinationImage: outputImage)
return
}
guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
throw PaddleMobileError.predictError(message: " encoder is nil")
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册