// RGBToYCrCb_Y.metal
// paddle-mobile-demo
// Created by liuRuiLong on 2018/12/28.
// Copyright © 2018 orange. All rights reserved.
#include <metal_stdlib>
using namespace metal;
// Copies one float per texel from `input` into the first (red) component of `outTexture`;
// the remaining three components are written as 0.
//
// input      - source buffer, indexed as `outTexture.get_width() * gid.y + gid.x`.
// outTexture - write-only destination texture.
// gid        - position of this thread in the dispatch grid; used as the texel coordinate.
kernel void buffer_to_texture_kernel(const device float *input [[buffer(0)]],
                                     texture2d<float, access::write> outTexture [[texture(0)]],
                                     uint2 gid [[thread_position_in_grid]]) {
    // Threads that fall outside the texture must not read the buffer or write a texel.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height()) {
        return;
    }
    float y = input[outTexture.get_width() * gid.y + gid.x];
    outTexture.write(float4(y, 0.0f, 0.0f, 0.0f), gid);
}
// Half-precision variant: copies one float per texel from `input` into the first (red)
// component of a `half` texture; the remaining three components are written as 0.
//
// input      - source buffer of floats, indexed as `outTexture.get_width() * gid.y + gid.x`.
// outTexture - write-only destination texture with `half` components.
// gid        - position of this thread in the dispatch grid; used as the texel coordinate.
kernel void buffer_to_texture_kernel_half(const device float *input [[buffer(0)]],
                                          texture2d<half, access::write> outTexture [[texture(0)]],
                                          uint2 gid [[thread_position_in_grid]]) {
    // Threads that fall outside the texture must not read the buffer or write a texel.
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height()) {
        return;
    }
    float y = input[outTexture.get_width() * gid.y + gid.x];
    outTexture.write(half4(y, 0.0f, 0.0f, 0.0f), gid);
}
......@@ -41,7 +41,7 @@ public class Genet: Net {
/// Custom kernel bound to the `genet_preprocess` shader function with a 128x128, 3-channel output shape.
class GenetPreProccess: CusomKernel {
    /// - Parameter device: Metal device forwarded to the `CusomKernel` initializer.
    init(device: MTLDevice) {
        // Single declaration of the output shape (the block previously declared `s` twice).
        let s = Shape.init(inWidth: 128, inHeight: 128, inChannel: 3)
        super.init(device: device, inFunctionName: "genet_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
}
......@@ -18,7 +18,7 @@ public class MobileNet: Net{
/// Custom kernel bound to the `mobilenet_preprocess` shader function with a 224x224, 3-channel output shape.
class MobilenetPreProccess: CusomKernel {
    /// - Parameter device: Metal device forwarded to the `CusomKernel` initializer.
    init(device: MTLDevice) {
        // Single declaration of the output shape (the block previously declared `s` twice).
        let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
        super.init(device: device, inFunctionName: "mobilenet_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
}
......@@ -17,11 +17,11 @@ public class MobileNetCombined: Net {
modelPath = Bundle.main.path(forResource: "combined_mobilenet_model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "combined_mobilenet_params", ofType: nil) ?! "para null"
modelDir = ""
//preprocessKernel = GenetPreProccess.init(device: device)
inputDim_ = Dim.init(inDim: [1, 416, 416, 3])
inputDim_ = Dim.init(inDim: [1, 224, 224, 3])
@objc override public init(device: MTLDevice,paramPointer: UnsafeMutableRawPointer, paramSize:Int, modePointer: UnsafeMutableRawPointer, modelSize: Int) {
means = [0, 0, 0]
scale = 1
......@@ -29,9 +29,7 @@ public class MobileNetCombined: Net {
modelPath = ""
paramPath = ""
modelDir = ""
//preprocessKernel = GenetPreProccess.init(device: device)
inputDim_ = Dim.init(inDim: [1, 416, 416, 3])
inputDim_ = Dim.init(inDim: [1, 224, 224, 3])
// class GenetPreProccess: CusomKernel {
......@@ -41,7 +41,7 @@ public class MobileNet_ssd_hand: Net{
/// Custom kernel bound to the `mobilenet_ssd_preprocess` shader function with a 300x300, 3-channel output shape.
class MobilenetssdPreProccess: CusomKernel {
    /// - Parameter device: Metal device forwarded to the `CusomKernel` initializer.
    init(device: MTLDevice) {
        // Single declaration of the output shape (the block previously declared `s` twice).
        let s = Shape.init(inWidth: 300, inHeight: 300, inChannel: 3)
        super.init(device: device, inFunctionName: "mobilenet_ssd_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
}
......@@ -41,7 +41,7 @@ public class MobileNet_ssd_AR: Net{
/// Custom kernel bound to the `mobilent_ar_preprocess` shader function with a 160x160, 3-channel output shape.
class MobilenetssdPreProccess: CusomKernel {
    /// - Parameter device: Metal device forwarded to the `CusomKernel` initializer.
    init(device: MTLDevice) {
        // Single declaration of the output shape (the block previously declared `s` twice).
        let s = Shape.init(inWidth: 160, inHeight: 160, inChannel: 3)
        // The function name string is kept exactly as written; it has to match the shader's name.
        super.init(device: device, inFunctionName: "mobilent_ar_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
}
......@@ -14,6 +14,14 @@
import Foundation
/// Custom kernel bound to the `super_resolution_preprocess` shader function with a 224x224, 3-channel output shape.
class SuperResolutionPreProccess: CusomKernel {
    /// - Parameter device: Metal device forwarded to the `CusomKernel` initializer.
    init(device: MTLDevice) {
        let s = Shape.init(inWidth: 224, inHeight: 224, inChannel: 3)
        super.init(device: device, inFunctionName: "super_resolution_preprocess", outputDim: s, usePaddleMobileLib: false)
    }
}
public class SuperResolutionNet: Net{
override public func resultStr(res: ResultHolder) -> String {
return "未实现"
......@@ -18,7 +18,7 @@ public class YoloNet: Net {
modelPath = Bundle.main.path(forResource: "yolo_model", ofType: nil) ?! "model null"
paramPath = Bundle.main.path(forResource: "yolo_params", ofType: nil) ?! "para null"
modelDir = ""
//preprocessKernel = GenetPreProccess.init(device: device)
// preprocessKernel = GenetPreProccess.init(device: device)
inputDim_ = Dim.init(inDim: [1, 416, 416, 3])
......@@ -46,17 +46,60 @@ open class Kernel {
open class CusomKernel: Kernel {
public struct Shape {
public let width: Int
public let height: Int
public let channel: Int
public init(inWidth: Int, inHeight: Int, inChannel: Int){
width = inWidth
height = inHeight
channel = inChannel
/// Width, height and channel count describing a kernel's output.
public struct Shape {
    /// Extent along the horizontal axis.
    public let width: Int
    /// Extent along the vertical axis.
    public let height: Int
    /// Number of channels.
    public let channel: Int

    /// Creates a shape from its three extents.
    /// - Parameters:
    ///   - inWidth: Value stored in `width`.
    ///   - inHeight: Value stored in `height`.
    ///   - inChannel: Value stored in `channel`.
    public init(inWidth: Int, inHeight: Int, inChannel: Int) {
        width = inWidth
        height = inHeight
        channel = inChannel
    }
}
/// Kernel that copies an `MTLBuffer` into a texture it allocates at initialization.
///
/// The texture's pixel format and the shader function are chosen from the global
/// `computePrecision`: `.Float32` uses `.rgba32Float` with `buffer_to_texture_kernel`,
/// `.Float16` uses `.rgba16Float` with `buffer_to_texture_kernel_half`.
open class BufferToTextureKernel: Kernel {
    /// Destination texture written by `compute(inputBuffer:commandBuffer:)`.
    public let outputTexture: MTLTexture

    /// Allocates the output texture and initializes the kernel for the current precision.
    /// - Parameters:
    ///   - device: Metal device used to create the texture; also forwarded to `Kernel`.
    ///   - outputDim: Width, height and channel count of the output.
    ///   - usePaddleMobileLib: Forwarded unchanged to the `Kernel` initializer.
    public init(device: MTLDevice, outputDim: Shape, usePaddleMobileLib: Bool = false) {
        let textureDesc = MTLTextureDescriptor.init()
        textureDesc.textureType = .type2D
        textureDesc.width = outputDim.width
        textureDesc.height = outputDim.height
        // Four channels are packed per RGBA texel.
        // NOTE(review): a `.type2D` texture is expected to have depth 1, so this presumably
        // only validates for up to 4 channels — confirm against callers.
        textureDesc.depth = (outputDim.channel + 3) / 4
        if computePrecision == .Float16 {
            textureDesc.pixelFormat = .rgba16Float
        } else if computePrecision == .Float32 {
            textureDesc.pixelFormat = .rgba32Float
        } else {
            // No pixel format is defined for any other precision; fail loudly instead of
            // creating a texture with an unset format.
            fatalError(" unsupported compute precision ")
        }
        // Usage and storage mode apply to every precision, so they are set outside the branch.
        textureDesc.usage = [.shaderRead, .shaderWrite]
        textureDesc.storageMode = .shared
        outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error "

        if computePrecision == .Float32 {
            super.init(device: device, inFunctionName: "buffer_to_texture_kernel", usePaddleMobileLib: usePaddleMobileLib)
        } else {
            super.init(device: device, inFunctionName: "buffer_to_texture_kernel_half", usePaddleMobileLib: usePaddleMobileLib)
        }
    }

    /// Encodes the buffer-to-texture copy into `commandBuffer`.
    /// - Parameters:
    ///   - inputBuffer: Source buffer bound at buffer index 0.
    ///   - commandBuffer: Command buffer that receives the compute pass.
    /// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
    public func compute(inputBuffer: MTLBuffer, commandBuffer: MTLCommandBuffer) throws {
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
        encoder.setBuffer(inputBuffer, offset: 0, index: 0)
        encoder.setTexture(outputTexture, index: 0)
        encoder.dispatch(computePipline: pipline, outTexture: outputTexture)
        // An encoder must be ended before its command buffer can be committed.
        encoder.endEncoding()
    }
}
open class CusomKernel: Kernel {
public let outputTexture: MTLTexture
public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) {
let textureDesc = MTLTextureDescriptor.init()
......@@ -49,7 +49,7 @@ class PoolKernel<P: PrecisionType>: Kernel, Computable{
if computePrecision == .Float32 {
super.init(device: device, inFunctionName: "pool")
super.init(device: device, inFunctionName: "pool_float")
} else if computePrecision == .Float16 {
super.init(device: device, inFunctionName: "pool_half")
} else {
......@@ -24,6 +24,6 @@ using namespace metal;
// Name-building helpers. Each one forwards its arguments to a CONCATn / CONCATn_ macro
// that is defined outside this excerpt (presumably token-pasting helpers — confirm there).
// FUNC: builds an identifier from five parts via CONCAT5_.
#define FUNC(f, r, n, v, p) CONCAT5_(f, r, n, v, p)
// VECTOR: builds a type name from a scalar type and a component count, e.g. VECTOR(P, 4).
#define VECTOR(p, n) CONCAT2(p, n)
// FUNC2_: builds an identifier from two parts via CONCAT2_, e.g. FUNC2_(pool, P).
#define FUNC2_(a, b) CONCAT2_(a, b)
// FUNC3_: builds an identifier from three parts via CONCAT3_.
#define FUNC3_(a, b, c) CONCAT3_(a, b, c)
// PoolKernel.inc.metal
// paddle-mobile
// Created by liuRuiLong on 2018/12/29.
// Copyright © 2018 orange. All rights reserved.
#ifdef P
// Pooling kernel template, instantiated once per scalar type by defining `P` before
// including this file. One thread produces one output texel of one array slice.
//
// inTexture  - source texture array.
// outTexture - destination texture array.
// pm         - pooling parameters; poolType 0 takes the component-wise maximum,
//              poolType 1 averages over the part of the window inside the input.
// gid        - output texel (x, y) and array slice (z).
kernel void FUNC2_(pool, P)(texture2d_array<P, access::read> inTexture [[texture(0)]],
                            texture2d_array<P, access::write> outTexture [[texture(1)]],
                            constant PoolParam &pm [[buffer(0)]],
                            uint3 gid [[thread_position_in_grid]]) {
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) return;

    // Horizontal window, clipped to the input texture.
    int xmin = gid.x * pm.strideX - pm.paddingX;
    int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
    xmin = max(xmin, 0);

    // Vertical window, clipped to the input texture. The extent uses ksizeY so that
    // non-square windows cover the right number of rows.
    // NOTE(review): the origin still uses strideX / paddingX; if PoolParam also carries
    // strideY / paddingY they are presumably what belongs here — confirm against the struct.
    int ymin = gid.y * pm.strideX - pm.paddingX;
    int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
    ymin = max(ymin, 0);

    VECTOR(P, 4) r = 0;
    if (pm.poolType == 0) {
        // Seed with the first texel so the maximum is taken over real samples only.
        r = inTexture.read(uint2(xmin, ymin), gid.z);
        for (int x = xmin; x < xmax; x++) {
            for (int y = ymin; y < ymax; y++) {
                r = fmax(r, inTexture.read(uint2(x, y), gid.z));
            }
        }
    } else if (pm.poolType == 1) {
        for (int x = xmin; x < xmax; x++) {
            for (int y = ymin; y < ymax; y++) {
                r += inTexture.read(uint2(x, y), gid.z);
            }
        }
        // Divide by the number of texels actually summed (the clipped window).
        r /= (xmax - xmin) * (ymax - ymin);
    }
    outTexture.write(r, gid.xy, gid.z);
}
#endif
......@@ -13,7 +13,8 @@
limitations under the License. */
#include <metal_stdlib>
#include "Common.metal"
#include "Macro.metal"
using namespace metal;
struct PoolParam {
......@@ -26,68 +27,10 @@ struct PoolParam {
int poolType;
// Float pooling kernel. One thread produces one output texel of one array slice.
//
// inTexture  - source texture array.
// outTexture - destination texture array.
// pm         - pooling parameters; poolType 0 takes the component-wise maximum,
//              poolType 1 sums the window and divides by ksizeX * ksizeY.
// gid        - output texel (x, y) and array slice (z).
kernel void pool(texture2d_array<float, access::read> inTexture [[texture(0)]],
                 texture2d_array<float, access::write> outTexture [[texture(1)]],
                 constant PoolParam &pm [[buffer(0)]],
                 uint3 gid [[thread_position_in_grid]]) {
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) return;

    // Horizontal window, clipped to the input texture.
    int xmin = gid.x * pm.strideX - pm.paddingX;
    int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
    xmin = max(xmin, 0);

    // Vertical window, clipped to the input texture. The extent uses ksizeY, matching
    // the ksizeX * ksizeY divisor used for averaging below.
    // NOTE(review): the origin still uses strideX / paddingX; if PoolParam also carries
    // strideY / paddingY they are presumably what belongs here — confirm against the struct.
    int ymin = gid.y * pm.strideX - pm.paddingX;
    int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
    ymin = max(ymin, 0);

    float4 r = 0;
    if (pm.poolType == 0) {
        // Seed with the first texel so the maximum is taken over real samples only.
        r = inTexture.read(uint2(xmin, ymin), gid.z);
        for (int x = xmin; x < xmax; x++) {
            for (int y = ymin; y < ymax; y++) {
                r = fmax(r, inTexture.read(uint2(x, y), gid.z));
            }
        }
    } else if (pm.poolType == 1) {
        for (int x = xmin; x < xmax; x++) {
            for (int y = ymin; y < ymax; y++) {
                r += inTexture.read(uint2(x, y), gid.z);
            }
        }
        // Divides by the full window size, even where the window was clipped.
        r /= pm.ksizeX * pm.ksizeY;
    }
    outTexture.write(r, gid.xy, gid.z);
}
#define P float
#import "PoolKernel.inc.metal"
#undef P
// Half-precision pooling kernel. One thread produces one output texel of one array slice.
//
// inTexture  - source texture array.
// outTexture - destination texture array.
// pm         - pooling parameters; poolType 0 takes the component-wise maximum,
//              poolType 1 sums the window and divides by ksizeX * ksizeY.
// gid        - output texel (x, y) and array slice (z).
//
// NOTE(review): the `#define P half` / `#import "PoolKernel.inc.metal"` lines that follow
// this kernel appear to generate a function with the same name; only one of the two
// definitions should remain — confirm which one is intended.
kernel void pool_half(texture2d_array<half, access::read> inTexture [[texture(0)]],
                      texture2d_array<half, access::write> outTexture [[texture(1)]],
                      constant PoolParam &pm [[buffer(0)]],
                      uint3 gid [[thread_position_in_grid]]) {
    if (gid.x >= outTexture.get_width() ||
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) return;

    // Horizontal window, clipped to the input texture.
    int xmin = gid.x * pm.strideX - pm.paddingX;
    int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width()));
    xmin = max(xmin, 0);

    // Vertical window, clipped to the input texture. The extent uses ksizeY, matching
    // the ksizeX * ksizeY divisor used for averaging below.
    // NOTE(review): the origin still uses strideX / paddingX; if PoolParam also carries
    // strideY / paddingY they are presumably what belongs here — confirm against the struct.
    int ymin = gid.y * pm.strideX - pm.paddingX;
    int ymax = min(ymin + pm.ksizeY, int(inTexture.get_height()));
    ymin = max(ymin, 0);

    half4 r = 0;
    if (pm.poolType == 0) {
        // Seed with the first texel so the maximum is taken over real samples only.
        r = inTexture.read(uint2(xmin, ymin), gid.z);
        for (int x = xmin; x < xmax; x++) {
            for (int y = ymin; y < ymax; y++) {
                r = fmax(r, inTexture.read(uint2(x, y), gid.z));
            }
        }
    } else if (pm.poolType == 1) {
        for (int x = xmin; x < xmax; x++) {
            for (int y = ymin; y < ymax; y++) {
                r += inTexture.read(uint2(x, y), gid.z);
            }
        }
        // Divides by the full window size, even where the window was clipped.
        r /= pm.ksizeX * pm.ksizeY;
    }
    outTexture.write(r, gid.xy, gid.z);
}
#define P half
#import "PoolKernel.inc.metal"
#undef P
......@@ -51,7 +51,10 @@ class ScaleKernel: CusomKernel {
numel = net.inputDim.numel()
meansNumber = net.means.map { NSNumber.init(value: $0) }
meansNumber = net.means.map {
NSNumber.init(value: $0)
dimsNum = [NSNumber.init(value: net.inputDim[0]),
NSNumber.init(value: net.inputDim[3]),
NSNumber.init(value: net.inputDim[1]),
......@@ -119,6 +122,29 @@ class ScaleKernel: CusomKernel {
scaleTexture(input: texture!, complete: getTexture)
/// Converts `inBuffer` into a texture on the GPU and delivers it through `getTexture`.
///
/// The output shape is taken from `net.inputDim` (index 2 as width, index 1 as height,
/// index 3 as channel count). The callback runs from the command buffer's completion handler.
/// - Parameters:
///   - inBuffer: Source buffer handed to `BufferToTextureKernel.compute`.
///   - getTexture: Called with the kernel's `outputTexture` once the command buffer completes.
@objc public func getTexture(inBuffer: MTLBuffer, getTexture: @escaping (MTLTexture) -> Void) {
    guard let inQueue = queue, let inDevice = device else {
        fatalError( " queue or devcie nil " )
    }
    guard let buffer = inQueue.makeCommandBuffer() else {
        fatalError( " make buffer error" )
    }
    let bufferToTextureKernel = BufferToTextureKernel.init(device: inDevice, outputDim: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: net.inputDim[3]))
    do {
        try bufferToTextureKernel.compute(inputBuffer: inBuffer, commandBuffer: buffer)
    } catch {
        fatalError(" bufferToTextureKernel error ")
    }
    // Hand the result back only after the GPU has finished writing it.
    buffer.addCompletedHandler { (buffer) in
        getTexture(bufferToTextureKernel.outputTexture)
    }
    // Nothing is executed, and the handler never fires, until the buffer is committed.
    buffer.commit()
}
public func scaleTexture(input: MTLTexture , complete: @escaping (MTLTexture) -> Void) {
guard let inQueue = queue, let inDevice = device else {
......@@ -129,7 +155,7 @@ class ScaleKernel: CusomKernel {
fatalError( " make buffer error" )
let scaleKernel = ScaleKernel.init(device: inDevice, shape: CusomKernel.Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3))
let scaleKernel = ScaleKernel.init(device: inDevice, shape: Shape.init(inWidth: net.inputDim[2], inHeight: net.inputDim[1], inChannel: 3))
do {
try scaleKernel.compute(inputTexuture: input, commandBuffer: buffer)
......@@ -25,8 +25,7 @@ public class GPUResultHolder {
public let capacity: Int
public var resultPointer: UnsafeMutablePointer<Float32>?
public var intermediateResults: [String : [Variant]]?
public let elapsedTime: Double
public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inElapsedTime: Double, inIntermediateResults: [String : [Variant]]? = nil) {
public init(inDim: [Int], inPointer: UnsafeMutablePointer<Float32>?, inCapacity: Int, inIntermediateResults: [String : [Variant]]? = nil) {
dim = inDim
capacity = inCapacity
......@@ -35,7 +34,6 @@ public class GPUResultHolder {
resultPointer?.initialize(from: inInPointer, count: inCapacity)
elapsedTime = inElapsedTime
intermediateResults = inIntermediateResults
......@@ -124,7 +122,6 @@ public class Executor<P: PrecisionType> {
resInput = input
let beforeDate = Date.init()
let inputTexture = InputTexture.init(inMTLTexture: resInput, inExpectDim: dim)
program.scope.setInput(input: inputTexture)
//(ops.count - except)
......@@ -150,28 +147,28 @@ public class Executor<P: PrecisionType> {
let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
writeToLibrary(fileName: "test_image_yolo", array: inputArr)
writeToLibrary(fileName: "test_image_mingren", array: inputArr)
print(" write done ")
/* 输出 op 计算结果
for op in SSelf.ops {
let afterDate = Date.init()
var resultHolder: GPUResultHolder
if except > 0 {
resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures)
resultHolder = GPUResultHolder.init(inDim: [], inPointer: nil, inCapacity: 0, inIntermediateResults: outputTextures)
} else {
let outputVar: Variant = SSelf.program.scope.output()!
let output: FetchHolder = outputVar as! FetchHolder
resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity, inElapsedTime: afterDate.timeIntervalSince(beforeDate))
resultHolder = GPUResultHolder.init(inDim: output.dim.dims, inPointer: output.result, inCapacity: output.capacity)
......@@ -96,8 +96,6 @@ public class Texture: Tensorial {
return metalTexture.realNHWC(dim: (n: padToFourDim[0], h: padToFourDim[1], w: padToFourDim[2], c: padToFourDim[3]))
func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) {
transpose = inTranspose
for i in 0..<(4 - tensorDim.cout()) {
......@@ -507,7 +507,7 @@ void Executor<Device, T>::Predict_To(int end) {
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
const LoDTensor &input_tensor) {
const Tensor &input_tensor) {
DLOG << "CL InitNoPersistableMemory ";
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
