Merge pull request #805 from dolphin8/metal

transpose op

Merge pull request #805 from dolphin8/metal
transpose op
e4554f0a · dolphin8 · GitHub · 0f373c95 · ff6b35c1 · e4554f0a
19 changed files
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -71,6 +71,105 @@ extension MTLDevice {
        return buffer!
    }
+    /// Reads a 2D-array texture back into a flat, row-major tensor of shape `dim`.
+    ///
+    /// `dim` is right-aligned into a 4-entry shape padded with leading 1s. The texture is expected
+    /// to hold that shape permuted by `transpose`: width = permuted[2], height = permuted[1], and the
+    /// permuted[0] * permuted[3] remaining values packed four per texel across the array slices.
+    ///
+    /// - Parameters:
+    ///   - texture: Source texture; its size is checked against the permuted shape with `assert`.
+    ///   - dim: Logical tensor shape (at most 4 entries).
+    ///   - transpose: Permutation relating the logical axes to the texture layout.
+    /// - Returns: `dim.reduce(1, *)` elements in row-major order of the padded logical shape.
+    /// - Note: The zero fill is produced by `Float32(0.0) as! P`, so the call traps for an
+    ///   element type that a `Float32` cannot be cast to.
+    func texture2tensor<P>(texture: MTLTexture, dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> [P] {
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let count = dim.reduce(1) { $0 * $1 }
+        var tensor: [P] = .init(repeating: Float32(0.0) as! P, count: count)
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        assert(texture.width == ndim[2])
+        assert(texture.height == ndim[1])
+        assert(texture.arrayLength == (ndim[0] * ndim[3] + 3) / 4)
+        let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+        let bpI = ndim[1] * bpR
+        let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
+        // One scratch buffer is reused for every slice and released on exit. `capacity` counts
+        // elements of `P`, not bytes, so it is height * width * 4 components.
+        let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: ndim[1] * ndim[2] * 4)
+        defer { pointer.deallocate() }
+        for i in 0..<texture.arrayLength {
+            texture.getBytes(pointer, bytesPerRow: bpR, bytesPerImage: bpI, from: region, mipmapLevel: 0, slice: i)
+            for h in 0..<ndim[1] {
+                for w in 0..<ndim[2] {
+                    for k in 0..<4 {
+                        // Offset of component k of texel (w, h) inside the slice buffer.
+                        let tx = (h * ndim[2] + w) * 4 + k
+                        // Slice i, component k holds flat index i * 4 + k over (permuted[0], permuted[3]).
+                        let n = (i * 4 + k) / ndim[3]
+                        let c = (i * 4 + k) % ndim[3]
+                        let jg = [n, h, w, c]
+                        // Undo the permutation to recover the logical index.
+                        var ig = [0, 0, 0, 0]
+                        for d in 0..<4 {
+                            ig[transpose[d]] = jg[d]
+                        }
+                        let ix = ig[0] * tdim[1] * tdim[2] * tdim[3] + ig[1] * tdim[2] * tdim[3] + ig[2] * tdim[3] + ig[3]
+                        // Padding components of the last slice map past the end and are skipped.
+                        if ix < count {
+                            tensor[ix] = pointer[tx]
+                        }
+                    }
+                }
+            }
+        }
+        return tensor
+    }
+    /// Creates an `.rgba32Float` 2D-array texture laid out for a tensor of shape `dim` and, when
+    /// `value` is non-empty, uploads the tensor into it.
+    ///
+    /// `dim` is right-aligned into a 4-entry shape padded with leading 1s and then permuted by
+    /// `transpose`. The texture has width = permuted[2], height = permuted[1], and
+    /// (permuted[0] * permuted[3] + 3) / 4 slices, with four consecutive values per texel.
+    ///
+    /// - Parameters:
+    ///   - value: Row-major tensor data, or an empty array to only allocate the texture.
+    ///   - dim: Logical tensor shape (at most 4 entries).
+    ///   - transpose: Permutation applied to the logical axes before packing.
+    /// - Returns: The newly created texture.
+    /// - Note: The padding fill is produced by `Float32(0.0) as! P`, so uploading traps for an
+    ///   element type that a `Float32` cannot be cast to.
+    func tensor2texture<P>(value: [P], dim: [Int], transpose: [Int] = [0, 1, 2, 3]) -> MTLTexture {
+        if value.count > 0 {
+            assert(value.count == dim.reduce(1) { $0 * $1 })
+        }
+        var tdim: [Int] = [1, 1, 1, 1]
+        for i in 0..<dim.count {
+            tdim[4 - dim.count + i] = dim[i]
+        }
+        let ndim: [Int] = transpose.map { tdim[$0] }
+        let textureDesc = MTLTextureDescriptor.init()
+        textureDesc.width = ndim[2]
+        textureDesc.height = ndim[1]
+        textureDesc.depth = 1
+        textureDesc.usage = [.shaderRead, .shaderWrite]
+        textureDesc.pixelFormat = .rgba32Float
+        textureDesc.textureType = .type2DArray
+        textureDesc.storageMode = .shared
+        textureDesc.cpuCacheMode = .defaultCache
+        textureDesc.arrayLength = (ndim[0] * ndim[3] + 3) / 4
+        let texture = makeTexture(descriptor: textureDesc)!
+        if value.count > 0 {
+            // Staging buffer padded up to whole texels: slices * 4 components * height * width.
+            var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
+            rcount = rcount * 4 * ndim[1] * ndim[2]
+            var nvalue: [P] = .init(repeating: Float32(0.0) as! P, count: rcount)
+            for i0 in 0..<tdim[0] {
+                for i1 in 0..<tdim[1] {
+                    for i2 in 0..<tdim[2] {
+                        for i3 in 0..<tdim[3] {
+                            let ig = [i0, i1, i2, i3]
+                            let ix = (i0 * tdim[1] * tdim[2] * tdim[3]) + (i1 * tdim[2] * tdim[3]) + (i2 * tdim[3]) + i3
+                            let jg = transpose.map { ig[$0] }
+                            // k is the flat index over (permuted[0], permuted[3]); k / 4 selects the
+                            // slice and k % 4 the component inside the texel.
+                            let k = jg[0] * ndim[3] + jg[3]
+                            let jx = ((k / 4) * ndim[1] * ndim[2] * 4) + (jg[1] * ndim[2] * 4) + (jg[2] * 4) + (k % 4)
+                            nvalue[jx] = value[ix]
+                        }
+                    }
+                }
+            }
+            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: ndim[2], height: ndim[1], depth: 1))
+            let bpR = ndim[2] * 4 * MemoryLayout<P>.size
+            let bpI = ndim[1] * bpR
+            let sliceStride = texture.width * texture.height * 4
+            // A pointer obtained from an array is only valid for the duration of the call it is
+            // passed to, so every upload happens inside the closure that pins the storage.
+            nvalue.withUnsafeBufferPointer { storage in
+                guard let base = storage.baseAddress else {
+                    return
+                }
+                for i in 0..<textureDesc.arrayLength {
+                    texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: base + sliceStride * i, bytesPerRow: bpR, bytesPerImage: bpI)
+                }
+            }
+        }
+        return texture
+    }
    func makeFloatTexture<P>(value: [P], textureWidth: Int, textureHeight: Int, arrayLength: Int) -> MTLTexture{
        let textureDesc = MTLTextureDescriptor.init()
@@ -85,19 +184,25 @@ extension MTLDevice {
        textureDesc.arrayLength = arrayLength
        let texture = makeTexture(descriptor: textureDesc)!
-        if arrayLength == 1 && value.count >= 4{
+        if value.count >= 4{
-            let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: value.count * MemoryLayout<P>.size)
+            let counts = arrayLength * 4 * textureWidth * textureHeight
+            let pointer: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: counts * MemoryLayout<P>.size)
            for i in 0..<value.count {
                pointer[i] = value[i]
            }
+            for i in value.count..<counts {
+                pointer[i] = 0 as! P
+            }
            let bytesPerRow = texture.width * texture.depth * 4 * MemoryLayout<P>.size
+            let bytesPerImage = texture.height * bytesPerRow
            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: texture.width, height: texture.height, depth: texture.depth))
-            texture.replace(region: region, mipmapLevel: 0, withBytes: pointer, bytesPerRow: bytesPerRow)
+            for i in 0..<arrayLength {
+                let p = pointer + texture.width * texture.height * 4 * i
+                texture.replace(region: region, mipmapLevel: 0, slice: i, withBytes: p, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage)
+            }
        } else {
        }
        return texture
@@ -112,16 +217,16 @@ extension MTLComputeCommandEncoder {
        let height = computePipline.maxTotalThreadsPerThreadgroup/width
        let threadsPerGroup = MTLSize.init(width: width, height: height, depth: 1)
-//        print(" thread: threads per group: \(threadsPerGroup) ")
+        print(" thread: threads per group: \(threadsPerGroup) ")
-//        print(" thread: out texture width: \(outTexture.width) , out texture height: \(outTexture.height)")
+        print(" thread: out texture width: \(outTexture.width) , out texture height: \(outTexture.height)")
        let groupWidth = (outTexture.width + width - 1)/width
        let groupHeight = (outTexture.height + height - 1)/height
        let groupDepth = slices
        let groups = MTLSize.init(width: groupWidth, height: groupHeight, depth: groupDepth)
-//        print("groups: \(groups) ")
+        print("groups: \(groups) ")
-//        print("threads per group: \(threadsPerGroup)")
+        print("threads per group: \(threadsPerGroup)")
        setComputePipelineState(computePipline)
@@ -183,54 +288,54 @@ public extension MTLTexture {
    func logDesc<T>(header: String = "", stridable: Bool = true) -> T? {
        print(header)
        print("texture: \(self)")
-        let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable)
+//        let res: [(index: Int, value: T)] = stridableFloatArray(stridable: stridable)
-        print(res)
+//        print(res)
-//        if textureType == .type2DArray {
+        if textureType == .type2DArray {
-//            for i in 0..<arrayLength{
+            for i in 0..<arrayLength{
-//                var str: String = "slice: \(i): \n"
+                var str: String = "slice: \(i): \n"
-//                let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
+                let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
-//                let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
+                let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
-//                let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
+                let bytesPerImage = width * height * depth * 4 * MemoryLayout<T>.size
-//                let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+                let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-//                getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
+                getBytes(bytes, bytesPerRow: bytesPerRow, bytesPerImage: bytesPerImage, from: region, mipmapLevel: 0, slice: i)
-//                let p = bytes.assumingMemoryBound(to: T.self)
+                let p = bytes.assumingMemoryBound(to: T.self)
-//                str += "2d array count : \(width * height * depth * 4) \n"
+                str += "2d array count : \(width * height * depth * 4) \n"
-//                if stridable && width * height * depth * 4 > 100 {
+                if stridable && width * height * depth * 4 > 100 {
-//                    for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 100){
+                    for j in stride(from: 0, to: width * height * depth * 4 , by: width * height * depth * 4 / 100){
-//                        str += " index \(j): \(p[j])"
+                        str += " index \(j): \(p[j])"
-//                    }
+                    }
-//                } else {
+                } else {
-//                    for j in 0..<width * height * depth * 4 {
+                    for j in 0..<width * height * depth * 4 {
-//                        str += " index \(j): \(p[j])"
+                        str += " index \(j): \(p[j])"
-//                    }
+                    }
-//                }
+                }
-//
-//                bytes.deallocate()
+                bytes.deallocate()
-//                print(str)
+                print(str)
-//            }
+            }
-//        } else if textureType == .type2D {
+        } else if textureType == .type2D {
-//            var str: String = "texture 2D: "
+            var str: String = "texture 2D: "
-//            let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
+            let bytes = UnsafeMutableRawPointer.allocate(byteCount: width * height * 4 * MemoryLayout<T>.size, alignment: MemoryLayout<T>.alignment)
-//            let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
+            let bytesPerRow = width * depth * 4 * MemoryLayout<T>.size
-//            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
+            let region = MTLRegion.init(origin: MTLOrigin.init(x: 0, y: 0, z: 0), size: MTLSize.init(width: width, height: height, depth: depth))
-//            getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
+            getBytes(bytes, bytesPerRow: bytesPerRow, from: region, mipmapLevel: 0)
-//            let p = bytes.assumingMemoryBound(to: T.self)
+            let p = bytes.assumingMemoryBound(to: T.self)
-//            str += "2d count : \(width * width * 4) \n"
+            str += "2d count : \(width * width * 4) \n"
-//
-//            if stridable {
+            if stridable {
-//                for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 100){
+                for j in stride(from: 0, to: width * height * 4, by: width * height * 4 / 100){
-//                    str += "index \(j): \(p[j]) "
+                    str += "index \(j): \(p[j]) "
-//                }
+                }
-//            } else {
+            } else {
-//                for j in 0..<width * height * 4 {
+                for j in 0..<width * height * 4 {
-//                    str += "index \(j): \(p[j]) "
+                    str += "index \(j): \(p[j]) "
-//                }
+                }
-//            }
+            }
-//
-//            print(str)
+            print(str)
-//            bytes.deallocate()
+            bytes.deallocate()
-//        }
+        }
        return nil
    }

--- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
@@ -17,6 +17,103 @@ public class PaddleMobileUnitTest {
        queue = inQueue
    }
+    /// Recursively prints `tensor` as nested bracketed lists, indenting one space per nesting level.
+    ///
+    /// - Parameters:
+    ///   - tensor: Flat, row-major element storage.
+    ///   - dim: Extent of every axis.
+    ///   - ix: Indices chosen by the enclosing levels; the entry at `indentLevel` is set here.
+    ///   - indentLevel: Axis handled by this call, which is also the number of leading spaces.
+    private func indentPrintTensor(tensor: [Float32], dim: [Int], ix: [Int], indentLevel: Int) {
+        let pad = String(repeating: " ", count: indentLevel)
+        guard dim.count == indentLevel + 1 else {
+            // Inner axis: open a bracket, recurse once per index, then close the bracket.
+            print(pad + "[")
+            var prefix = ix
+            for i in 0..<dim[indentLevel] {
+                prefix[indentLevel] = i
+                indentPrintTensor(tensor: tensor, dim: dim, ix: prefix, indentLevel: indentLevel + 1)
+            }
+            // A trailing comma is emitted when the parent axis still has entries after this one.
+            let hasSibling = (indentLevel > 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1)
+            print(pad + (hasSibling ? "]," : "]"))
+            return
+        }
+        // Innermost axis: gather the whole row on a single line.
+        // Row-major step per axis is the product of the extents of all later axes.
+        var steps = [Int](repeating: 1, count: dim.count)
+        for axis in stride(from: dim.count - 2, through: 0, by: -1) {
+            steps[axis] = steps[axis + 1] * dim[axis + 1]
+        }
+        let cells: [String] = (0..<dim[indentLevel]).map { (i: Int) -> String in
+            var coords = ix
+            coords[indentLevel] = i
+            var flat = 0
+            for axis in 0..<dim.count {
+                flat += coords[axis] * steps[axis]
+            }
+            return tensor[flat].description
+        }
+        var line = pad + "[" + cells.joined(separator: ", ") + "]"
+        if (indentLevel > 0) && (ix[indentLevel - 1] < dim[indentLevel - 1] - 1) {
+            line += ","
+        }
+        print(line)
+    }
+    /// Pretty-prints `tensor` using shape `dim`, where at most one entry of `dim` may be `-1`
+    /// and is then derived from `tensor.count`.
+    ///
+    /// Shape problems (more than one `-1`, a zero or other negative extent, or an element count
+    /// that does not fit) are reported through `assert`.
+    private func tensorPrint(tensor: [Float32], dim: [Int]) {
+        // -1: no placeholder seen, >= 0: position of the single placeholder,
+        // -2: more than one placeholder, -3: an extent that is neither positive nor -1.
+        var placeholder = -1
+        var knownProduct = 1
+        for (axis, extent) in dim.enumerated() {
+            if extent == -1 {
+                placeholder = (placeholder == -1) ? axis : -2
+            } else if extent <= 0 {
+                placeholder = -3
+            } else {
+                knownProduct *= extent
+            }
+        }
+        assert(placeholder >= -1)
+        var resolved = dim
+        if placeholder == -1 {
+            assert(tensor.count == knownProduct)
+        } else {
+            assert(tensor.count % knownProduct == 0)
+            resolved[placeholder] = tensor.count / knownProduct
+        }
+        let origin = [Int](repeating: 0, count: dim.count)
+        indentPrintTensor(tensor: tensor, dim: resolved, ix: origin, indentLevel: 0)
+    }
+    /// Runs the `transpose` kernel on a 4x3x2x3 ramp tensor (values 0...71) with axis
+    /// permutation [3, 1, 2, 0] and, once the command buffer completes, logs both textures
+    /// and prints the input tensor next to the tensor read back from the output texture.
+    public func testTranspose() {
+        let commandBuffer = queue.makeCommandBuffer() ?! "buffer is nil"
+        let input: [Float32] = (0..<72).map { Float32($0) }
+        let inputTexture = device.tensor2texture(value: input, dim: [4, 3, 2, 3])
+        // An empty value array only allocates the output texture.
+        let outputTexture = device.tensor2texture(value: [Float32](), dim: [3, 3, 2, 4])
+        let param = TransposeTestParam.init(inputTexture: inputTexture,
+                                            outputTexture: outputTexture,
+                                            iC: 3,
+                                            oC: 4,
+                                            axis: [3, 1, 2, 0])
+        let kernel = TransposeKernel<Float32>.init(device: device, testParam: param)
+        kernel.test(commandBuffer: commandBuffer, param: param)
+        commandBuffer.addCompletedHandler { (_) in
+            let _: Float32? = inputTexture.logDesc(header: "input texture", stridable: false)
+            let _: Float32? = outputTexture.logDesc(header: "output texture", stridable: false)
+            self.tensorPrint(tensor: input, dim: [4, 3, 2, 3])
+            let readBack: [Float32] = self.device.texture2tensor(texture: outputTexture, dim: [3, 3, 2, 4])
+            self.tensorPrint(tensor: readBack, dim: [3, 3, 2, 4])
+        }
+        commandBuffer.commit()
+    }
    public func testConvAddBnRelu() {
        let buffer = queue.makeCommandBuffer() ?! " buffer is nil "
@@ -132,16 +229,6 @@ public class PaddleMobileUnitTest {
        }
        buffer.commit()
-//        let inputTexture = device.makeFloatTexture(value: <#T##[P]#>, textureWidth: <#T##Int#>, textureHeight: <#T##Int#>, arrayLength: <#T##Int#>)
-//        let param = ConvAddBatchNormReluTestParam.init(inInputTexture: <#T##MTLTexture#>, inOutputTexture: <#T##MTLTexture#>, inMetalParam: <#T##MetalConvParam#>, inFilterBuffer: <#T##MTLBuffer#>, inBiaseBuffer: <#T##MTLBuffer#>, inNewScaleBuffer: <#T##MTLBuffer#>, inNewBiaseBuffer: <#T##MTLBuffer#>, inFilterSize: <#T##(width: Int, height: Int, channel: Int)#>)
-//        ConvAddBatchNormReluKernel.init(device: <#T##MTLDevice#>, testParam: <#T##ConvAddBatchNormReluTestParam#>)
    }
 }

--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -68,16 +68,6 @@ public class Executor<P: PrecisionType> {
                    throw error
                }
            }
-//            for op in block.ops {
-//                do {
-//                    let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
-//                    op.inferShape()
-//                    ops.append(op)
-//                } catch let error {
-//                    throw error
-//                }
-//            }
        }
    }

--- a/metal/paddle-mobile/paddle-mobile/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Loader.swift
@@ -141,10 +141,6 @@ public class Loader<P: PrecisionType> {
                            throw PaddleMobileError.loaderError(message: "get tensor desc failed")
                        }
-//                        guard (try? tensorDesc.dataType.dataTypeSize()) == MemoryLayout<P>.size else {
-//                            throw PaddleMobileError.memoryError(message: "PrecisionType not support")
-//                        }
                        if (varDesc.persistable
                            && varDesc.type != .FeedMiniBatch
                            && varDesc.type != .FetchList) {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
@@ -56,7 +56,9 @@ class OpCreator<P: PrecisionType> {
         gBoxcoderType              :     BoxcoderOp<P>.creat,
         gConvBnReluType            :     ConvBNReluOp<P>.creat,
         gDwConvBnReluType          :     DwConvBNReluOp<P>.creat,
-         gMulticlassNMSType         :     MulticlassNMSOp<P>.creat]
+         gMulticlassNMSType         :     MulticlassNMSOp<P>.creat,
+         gTransposeType             :     TransposeOp<P>.creat,
+         gPriorBoxType              :     PriorBoxOp<P>.creat]
    private init(){}
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift
@@ -18,19 +18,28 @@ class BoxcoderParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
-            fatalError()
+            priorBox = try BoxcoderParam.getFirstTensor(key: "PriorBox", map: opDesc.inputs, from: inScope)
+            priorBoxVar = try BoxcoderParam.getFirstTensor(key: "PriorBoxVar", map: opDesc.inputs, from: inScope)
+            targetBox = try BoxcoderParam.getFirstTensor(key: "TargetBox", map: opDesc.inputs, from: inScope)
+            output = try BoxcoderParam.getFirstTensor(key: "OutputBox", map: opDesc.outputs, from: inScope)
+            codeType = try BoxcoderParam.getAttr(key: "code_type", attrs: opDesc.attrs)
+            boxNormalized = try BoxcoderParam.getAttr(key: "box_normalized", attrs: opDesc.attrs)
        } catch let error {
            throw error
        }
    }
-    let input: Texture<P>
+    let priorBox: Texture<P>
+    let priorBoxVar: Texture<P>
+    let targetBox: Texture<P>
    var output: Texture<P>
+    let codeType: String
+    let boxNormalized: Bool
 }
 class BoxcoderOp<P: PrecisionType>: Operator<BoxcoderKernel<P>, BoxcoderParam<P>>, Runable, Creator, InferShaperable{
    func inferShape() {
-        para.output.dim = para.input.dim
+//        para.output.dim = para.input.dim
    }
    typealias OpType = BoxcoderOp<P>

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift
@@ -18,19 +18,31 @@ class ConcatParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
-            fatalError()
+            guard let xlist = opDesc.inputs["X"] else {
+                fatalError()
+            }
+            for x in xlist {
+                guard let variant = inScope[x], let v = variant as? Texture<P> else {
+                    fatalError()
+                }
+                input.append(v)
+            }
+            axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
+            output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
        } catch let error {
            throw error
        }
    }
-    let input: Texture<P>
+    var input: [Texture<P>] = []
    var output: Texture<P>
+    let axis: Int
 }
 class ConcatOp<P: PrecisionType>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
    func inferShape() {
-        para.output.dim = para.input.dim
+        let dim = para.input.reduce([0, 0]) {[$0[0] + $1.dim[0], $1.dim[1]]}
+        para.output.dim = Dim.init(inDim: dim)
    }
    typealias OpType = ConcatOp<P>

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
@@ -19,7 +19,7 @@ class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
-        encoder.setTexture(param.input.metalTexture, index: 0)
+//        encoder.setTexture(param.input.metalTexture, index: 0)
        encoder.setTexture(param.output.metalTexture, index: 1)
        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
        encoder.endEncoding()

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Concat.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+import Foundation
+/// Compute kernel wrapper that encodes the `concat` Metal function.
+class ConcatKernel<P: PrecisionType>: Kernel, Computable{
+    /// Encodes one dispatch of the pipeline over the output texture.
+    ///
+    /// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
+    func compute(commandBuffer: MTLCommandBuffer, param: ConcatParam<P>) throws {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encoder is nil")
+        }
+        // `param.input` is a list of textures, so each one is bound to its own slot and the
+        // output goes in the slot after them. With a single input this is the 0 / 1 layout.
+        // NOTE(review): the slot order expected by the `concat` shader for several inputs is
+        // not visible here — confirm against the Metal source.
+        for (slot, texture) in param.input.enumerated() {
+            encoder.setTexture(texture.metalTexture, index: slot)
+        }
+        encoder.setTexture(param.output.metalTexture, index: param.input.count)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
+    }
+    required init(device: MTLDevice, param: ConcatParam<P>) {
+        super.init(device: device, inFunctionName: "concat")
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift
@@ -19,13 +19,13 @@ class ConcatKernel<P: PrecisionType>: Kernel, Computable{
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
-        encoder.setTexture(param.input.metalTexture, index: 0)
+//        encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
+//        encoder.setTexture(param.output.metalTexture, index: 1)
        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
        encoder.endEncoding()
    }
    required init(device: MTLDevice, param: ConcatParam<P>) {
-        super.init(device: device, inFunctionName: "priorbox")
+        super.init(device: device, inFunctionName: "concat")
    }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -57,7 +57,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
        } else {
            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
        }
+        param.output.initTexture(device: device, transpose: [0, 2, 3, 1])
        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
        param.variance.initBuffer(device: device)

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Kernels.metal
@@ -202,7 +202,7 @@ kernel void reshape_half(texture2d_array<half, access::read> inTexture [[texture
        gid.y >= outTexture.get_height() ||
        gid.z >= outTexture.get_array_size()) return;
-    half4 r = inTexture.read(uint2(0, 0), gid.z);
+    half4 r = inTexture.read(uint2(0, 0), gid.x);
    outTexture.write(r, gid.xy, gid.z);
 }
@@ -321,8 +321,53 @@ kernel void prior_box(texture2d_array<float, access::read> inTexture [[texture(0
    }
 }
+// Converts a texture coordinate {x, y, slice, component} into a 4-index {a, b, c, d} of a
+// tensor whose last extent is C. The host-side packing puts tensor index 2 along the texture
+// width (x) and index 1 along the height (y); slice * 4 + component is the flat index over
+// (a, d) with d varying fastest.
+void xyzn2abcd(uint C, uint xyzn[4], uint abcd[4]) {
+    abcd[2] = xyzn[0];
+    abcd[1] = xyzn[1];
+    uint t = xyzn[2] * 4 + xyzn[3];
+    abcd[0] = t / C;
+    abcd[3] = t % C;
+}
+// Inverse of xyzn2abcd: converts a tensor 4-index {a, b, c, d} (last extent C) back into a
+// texture coordinate {x, y, slice, component}.
+void abcd2xyzn(uint C, uint abcd[4], uint xyzn[4]) {
+    xyzn[0] = abcd[2];
+    xyzn[1] = abcd[1];
+    uint t = abcd[0] * C + abcd[3];
+    xyzn[2] = t / 4;
+    xyzn[3] = t % 4;
+}
+// Parameters of the transpose kernel, bound at buffer index 0.
+struct TransposeParam {
+    int iC;      // last extent used when converting a tensor index to an input coordinate
+    int oC;      // last extent used when converting an output coordinate to a tensor index
+    int axis[4]; // output tensor index k is stored at position axis[k] of the input index
+};
+// Writes one output texel per thread. With the identity permutation the texel is copied
+// unchanged; otherwise each of its four components is fetched from the input location that
+// the permutation maps it to.
+kernel void transpose(texture2d_array<float, access::read> inTexture [[texture(0)]],
+                 texture2d_array<float, access::write> outTexture [[texture(1)]],
+                 constant TransposeParam &pm [[buffer(0)]],
+                 uint3 gid [[thread_position_in_grid]]) {
+    // The grid is rounded up to whole threadgroups, so threads past the texture edge do nothing.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) return;
+    if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) {
+        // Identity permutation: plain copy.
+        float4 r = inTexture.read(gid.xy, gid.z);
+        outTexture.write(r, gid.xy, gid.z);
+    } else {
+        float4 r;
+        for (uint i = 0; i < 4; i++) {
+            // Coordinate of component i of this thread's output texel.
+            uint ixyzn[] = {gid.x, gid.y, gid.z, i};
+            uint iabcd[4], oabcd[4], oxyzn[4];
+            xyzn2abcd(pm.oC, ixyzn, iabcd);
+            // Scatter the output tensor index through the permutation to get the input index.
+            oabcd[pm.axis[0]] = iabcd[0];
+            oabcd[pm.axis[1]] = iabcd[1];
+            oabcd[pm.axis[2]] = iabcd[2];
+            oabcd[pm.axis[3]] = iabcd[3];
+            abcd2xyzn(pm.iC, oabcd, oxyzn);
+            float4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]);
+            r[i] = rt[oxyzn[3]];
+        }
+        outTexture.write(r, gid.xy, gid.z);
+    }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
@@ -19,8 +19,8 @@ class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
-        encoder.setTexture(param.input.metalTexture, index: 0)
+//        encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
+//        encoder.setTexture(param.output.metalTexture, index: 1)
        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
        encoder.endEncoding()
    }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift
@@ -14,18 +14,73 @@
 import Foundation
-class TransposeKernel<P: PrecisionType>: Kernel, Computable{
+/// Parameter block handed to the `transpose` kernel as raw bytes.
+///
+/// The stored properties are kept in this order (two channel counts followed by the four axis
+/// entries) because the struct is uploaded with `setBytes` using its in-memory layout.
+struct TransposeMetalParam {
+    var iC: Int32 = 0
+    var oC: Int32 = 0
+    var i0: Int32
+    var i1: Int32
+    var i2: Int32
+    var i3: Int32
+    /// Creates a parameter block from the four axis entries; `iC` and `oC` start at 0.
+    init(_ i0: Int32, _ i1: Int32, _ i2: Int32, _ i3: Int32) {
+        self.i3 = i3
+        self.i2 = i2
+        self.i1 = i1
+        self.i0 = i0
+    }
+    /// Creates a parameter block from the first four entries of `axis`.
+    init(_ axis: [Int]) {
+        let first = Int32(axis[0])
+        let second = Int32(axis[1])
+        let third = Int32(axis[2])
+        let fourth = Int32(axis[3])
+        self.init(first, second, third, fourth)
+    }
+}
+/// Inputs for running the transpose kernel directly on textures in a unit test.
+struct TransposeTestParam: TestParam {
+    /// Texture bound as the kernel input.
+    let inputTexture: MTLTexture
+    /// Texture the kernel writes to.
+    let outputTexture: MTLTexture
+    /// Value forwarded to the kernel parameter `iC`.
+    let iC: Int
+    /// Value forwarded to the kernel parameter `oC`.
+    let oC: Int
+    /// Axis permutation forwarded to the kernel.
+    let axis: [Int]
+}
+class TransposeKernel<P: PrecisionType>: Kernel, Computable, Testable {
    func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam<P>) throws {
        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
            throw PaddleMobileError.predictError(message: " encode is nil")
        }
+        var invT: [Int] = [0, 1, 2, 3]
+        for (i, v) in param.input.transpose.enumerated() {
+            invT[v] = i
+        }
+        let realAxis = param.axis.map {invT[$0]}
+        var tmp = TransposeMetalParam.init(realAxis)
+        tmp.iC = Int32(param.input.dim[param.input.transpose[3]])
+        tmp.oC = Int32(param.output.dim[3])
        encoder.setTexture(param.input.metalTexture, index: 0)
        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&tmp, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
        encoder.endEncoding()
    }
    required init(device: MTLDevice, param: TransposeParam<P>) {
-        super.init(device: device, inFunctionName: "priorbox")
+        param.output.initTexture(device: device, transpose: [0, 1, 2, 3])
+        super.init(device: device, inFunctionName: "transpose")
    }
-}
+    required init(device: MTLDevice, testParam: TransposeTestParam) {
+        super.init(device: device, inFunctionName: "transpose")
+    }
+    public func test(commandBuffer: MTLCommandBuffer, param: TransposeTestParam) {
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            fatalError()
+        }
+        encoder.setTexture(param.inputTexture, index: 0)
+        encoder.setTexture(param.outputTexture, index: 1)
+        var tmp = TransposeMetalParam.init(param.axis)
+        tmp.iC = Int32(param.iC)
+        tmp.oC = Int32(param.oC)
+        encoder.setBytes(&tmp, length: MemoryLayout<TransposeMetalParam>.size, index: 0)
+        encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+        encoder.endEncoding()
+    }}
--- a/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/MulticlassNMSOp.swift
@@ -18,19 +18,22 @@ class MulticlassNMSParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
-            fatalError()
+            scores = try MulticlassNMSParam.getFirstTensor(key: "Scores", map: opDesc.inputs, from: inScope)
+            bboxes = try MulticlassNMSParam.getFirstTensor(key: "BBoxes", map: opDesc.inputs, from: inScope)
+            output = try MulticlassNMSParam.outputOut(outputs: opDesc.outputs, from: inScope)
        } catch let error {
            throw error
        }
    }
-    let input: Texture<P>
+    let scores: Texture<P>
+    let bboxes: Texture<P>
    var output: Texture<P>
 }
 class MulticlassNMSOp<P: PrecisionType>: Operator<MulticlassNMSKernel<P>, MulticlassNMSParam<P>>, Runable, Creator, InferShaperable{
    func inferShape() {
-        para.output.dim = para.input.dim
+//        para.output.dim = para.input.dim
    }
    typealias OpType =  MulticlassNMSOp<P>

--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -18,13 +18,16 @@ class PriorBoxParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
-            fatalError()
+            input = try PriorBoxParam.input(inputs: opDesc.inputs, from: inScope)
+            output = try PriorBoxParam.getFirstTensor(key: "Boxes", map: opDesc.outputs, from: inScope)
+            variances = try PriorBoxParam.getFirstTensor(key: "Variances", map: opDesc.outputs, from: inScope)
        } catch let error {
            throw error
        }
    }
    let input: Texture<P>
    var output: Texture<P>
+    let variances: Texture<P>
 }
 class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>>, Runable, Creator, InferShaperable{
@@ -36,7 +39,7 @@ class PriorBoxOp<P: PrecisionType>: Operator<PriorBoxKernel<P>, PriorBoxParam<P>
    typealias OpType = PriorBoxOp<P>
    func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
        do {
-            try kernel.compute(commandBuffer: buffer, param: para)
+            // try kernel.compute(commandBuffer: buffer, param: para)
        } catch let error {
            throw error
        }

--- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift
@@ -18,13 +18,16 @@ class TransposeParam<P: PrecisionType>: OpParam {
    typealias ParamPrecisionType = P
    required init(opDesc: OpDesc, inScope: Scope) throws {
        do {
-            fatalError()
+            input = try TransposeParam.inputX(inputs: opDesc.inputs, from: inScope)
+            output = try TransposeParam.outputOut(outputs: opDesc.outputs, from: inScope)
+            axis = try TransposeParam.getAttr(key: "axis", attrs: opDesc.attrs)
        } catch let error {
            throw error
        }
    }
    let input: Texture<P>
    var output: Texture<P>
+    let axis: [Int32]
 }
 class TransposeOp<P: PrecisionType>: Operator<TransposeKernel<P>, TransposeParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/framework/Dim.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Dim.swift
@@ -39,7 +39,6 @@ public struct Dim {
        return dims[index];
    }
    private(set) var dims: [Int]
    private init(){
        fatalError()

--- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift
@@ -95,43 +95,6 @@ public class Texture<P: PrecisionType>: Tensorial {
        layout = DataLayout.init([(.N, fourDim[0]), (.C, fourDim[1]), (.H, fourDim[2]), (.W, fourDim[3])])
    }
-//    required public init(inDim: Dim, inLayout: DataLayout = .NHWC, inTexture: MTLTexture) {
-//        dim = inDim
-//        layout = inLayout
-//        metalTexture = inTexture
-//        let tmpTextureDes = MTLTextureDescriptor.init()
-//        
-//        if inDim.cout() == 1 {
-//            tmpTextureDes.width = inDim[0]
-//            tmpTextureDes.textureType = .type1D
-//        } else if inDim.cout() == 2 {
-//            tmpTextureDes.height = inDim[0]
-//            tmpTextureDes.width = inDim[1]
-//            tmpTextureDes.textureType = .type2D
-//        } else if inDim.cout() == 3 {
-//            fatalError(" not support texture dim 3")
-//        } else if inDim.cout() == 4 {
-//            tmpTextureDes.height = inDim[1]
-//            tmpTextureDes.width = inDim[2]
-//            tmpTextureDes.depth = inDim[3] * inDim[1]
-//            tmpTextureDes.textureType = .type2DArray
-//        }
-//        
-//        tmpTextureDes.pixelFormat = .r32Float
-//        tmpTextureDes.storageMode = .shared
-//        textureDesc = tmpTextureDes
-//        let device = MTLCreateSystemDefaultDevice()
-//        metalTexture = device!.makeTexture(descriptor: tmpTextureDes)!
-//    }
-//    init() {
-//        dim = Dim.init(inDim: [])
-//        layout = .NCHW
-//        let device = MTLCreateSystemDefaultDevice()
-//        textureDesc = MTLTextureDescriptor.init()
-//        metalTexture = device!.makeTexture(descriptor: textureDesc)!
-//    }
    private(set) var layout: DataLayout
 }