From 6ec031ffb208a60065d89b44452014a6b8567604 Mon Sep 17 00:00:00 2001
From: liuruilong <liuruilong@baidu.com>
Date: Thu, 30 Aug 2018 20:14:18 +0800
Subject: [PATCH] Add dilation support to Metal conv kernels

---
 .../paddle-mobile-demo/ViewController.swift   |   5 +-
 .../paddle-mobile.xcodeproj/project.pbxproj   |   2 +
 .../paddle-mobile/Common/MetalExtension.swift |   2 +-
 .../Common/PaddleMobileUnitTest.swift         |   2 +-
 .../paddle-mobile/Executor.swift              |  19 +-
 .../paddle-mobile/paddle-mobile/Loader.swift  | 314 +++++----
 .../paddle-mobile/Operators/ConvAddOp.swift   |   7 +
 .../Operators/ConvTransposeOp.swift           |  13 +-
 .../Operators/ElementwiseAddOp.swift          |  13 +-
 .../Kernels/ConvAddBatchNormReluKernel.swift  |   2 +-
 .../Operators/Kernels/ConvAddKernel.swift     |  11 +-
 .../Operators/Kernels/ConvBNReluKernel.swift  |   2 +-
 .../Operators/Kernels/ConvKernel.swift        |   4 +-
 .../Kernels/ConvTransposeKernel.swift         |   1 +
 .../Kernels/ElementwiseAddKernel.swift        |   2 +-
 .../Operators/Kernels/metal/ConvKernel.metal  | 665 +++++++++---------
 .../paddle-mobile/Program/BlockDesc.swift     |   6 +-
 .../paddle-mobile/framework/Tensor.swift      | 421 +++++------
 18 files changed, 765 insertions(+), 726 deletions(-)

diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
index 1674546fe5..e2351d14e3 100644
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -26,7 +26,7 @@ let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.
 enum SupportModel: String{
   //  case mobilenet = "mobilenet"
   case mobilenet_ssd = "mobilenetssd"
-  case genet          = "enet"
+  case genet          = "genet"
   static func supportedModels() -> [SupportModel] {
     //.mobilenet,
     return [.mobilenet_ssd ,.genet]
@@ -79,7 +79,7 @@ class ViewController: UIViewController {
       return
     }
     do {
-      let max = 1
+      let max = 10
       let startDate = Date.init()
       for i in 0..<max {
         try net.predict(inTexture: inTexture) { [weak self] (result) in
@@ -87,6 +87,7 @@ class ViewController: UIViewController {
             fatalError()
           }
           
+          print(result.resultArray)
           if i == max - 1 {
             let time = Date.init().timeIntervalSince(startDate)
             DispatchQueue.main.async {
diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
index 1da733019f..e04ab7bd0d 100644
--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -699,6 +699,7 @@
 					"@executable_path/Frameworks",
 					"@loader_path/Frameworks",
 				);
+				MACH_O_TYPE = mh_dylib;
 				MTL_LANGUAGE_REVISION = UseDeploymentTarget;
 				PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
@@ -727,6 +728,7 @@
 					"@executable_path/Frameworks",
 					"@loader_path/Frameworks",
 				);
+				MACH_O_TYPE = mh_dylib;
 				MTL_LANGUAGE_REVISION = UseDeploymentTarget;
 				PRODUCT_BUNDLE_IDENTIFIER = "orange.paddle-mobile";
 				PRODUCT_NAME = "$(TARGET_NAME:c99extidentifier)";
diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
index 2a51dc66f0..e250c37c49 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift
@@ -342,7 +342,7 @@ public extension MTLTexture {
   
   // n c h w - dim
   func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] {
-    print("origin dim: \(dim)")
+//    print("origin dim: \(dim)")
     print("texture: ")
     print(self)
     
diff --git a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
index 0d1fd39a34..2764146929 100644
--- a/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Common/PaddleMobileUnitTest.swift
@@ -314,7 +314,7 @@ public class PaddleMobileUnitTest {
         let offsetX = filterSize.width/2 - paddings.0
         let offsetY = filterSize.height/2 - paddings.1
         
-        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0))
+        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), paddedZ: UInt16(paddings.0), dilationX: UInt16(1), dilationY: UInt16(1))
         
         let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift
index f569b74c02..47c91de096 100644
--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -14,7 +14,7 @@
 
 import Foundation
 
-let testTo = 12
+let testTo = 54
 
 public class ResultHolder<P: PrecisionType> {
   public let dim: [Int]
@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> {
     queue = inQueue
     for block in inProgram.programDesc.blocks {
       //block.ops.count
-      for i in 0..<testTo {
+      for i in 0..<block.ops.count {
         let op = block.ops[i]
         do {
           let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
@@ -124,13 +124,13 @@ public class Executor<P: PrecisionType> {
       //            print(stridableInput)
       
       //            let _: Flo? = input.logDesc(header: "input: ", stridable: true)
-      for i in 0..<self.ops.count {
-        let op = self.ops[i]
-        print(" 第 \(i) 个 op: ")
-        op.delogOutput()
-      }
-      
-      return
+//      for i in 0..<self.ops.count {
+//        let op = self.ops[i]
+//        print(" 第 \(i) 个 op: ")
+//        op.delogOutput()
+//      }
+//      
+//      return
       
       let afterDate = Date.init()
      
@@ -145,7 +145,6 @@ public class Executor<P: PrecisionType> {
           return p
         }), inElapsedTime: afterDate.timeIntervalSince(beforeDate))
       }
-      
 
       completionHandle(resultHolder)
     }
diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift
index 5c2733a5d9..4b5f91f9c7 100644
--- a/metal/paddle-mobile/paddle-mobile/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Loader.swift
@@ -16,168 +16,170 @@ import Foundation
 import SwiftProtobuf
 
 public class Loader<P: PrecisionType> {
-    class ParaLoader {
-        let file: UnsafeMutablePointer<FILE>
-        let fileSize: Int
-        var nowIndex: Int
-        init(paramPath: String) throws {
-            guard let tmpFile = fopen(paramPath, "rb") else {
-                throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
-            }
-            file = tmpFile
-            fseek(file, 0, SEEK_END)
-            fileSize = ftell(file)
-            guard fileSize > 0 else {
-                throw PaddleMobileError.loaderError(message: "param file size is too small")
-            }
-            rewind(file)
-            nowIndex = 0
-        }
-        
-        func read(tensor: Tensor<P>) throws {
-            guard nowIndex <= fileSize else {
-                throw PaddleMobileError.loaderError(message: "out of the file range")
-            }
-            
-            func pointerReader<T>(type: T.Type) -> T {
-                let ptr = UnsafeMutablePointer<T>.allocate(capacity: MemoryLayout<T>.size)
-                fread(ptr, 1, MemoryLayout<T>.size, file)
-                nowIndex += MemoryLayout<T>.size
-                let pointee = ptr.pointee
-                ptr.deinitialize(count: MemoryLayout<UInt32>.size)
-                ptr.deallocate()
-                return pointee
-            }
-            
-            let _ = pointerReader(type: UInt32.self)
-            let lodLevel = pointerReader(type: UInt64.self)
-            for _ in 0..<lodLevel {
-                let size = pointerReader(type: UInt64.self)
-                for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
-                    _ = pointerReader(type: size_t.self)
-                }
-            }
-            
-            let _ = pointerReader(type: UInt32.self)
-            
-            let tensorDescSize = pointerReader(type: Int32.self)
-            
-            fseek(file, Int(tensorDescSize), SEEK_CUR)
-            nowIndex += Int(tensorDescSize)
-            
-            /*
-             这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度
-             */
-            
-            //现在模型传入模型为  Float 类型, 这块应该根据模型来
-//            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
-//            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
-            let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
-            
-            guard bytesRead == tensor.data.size else {
-                throw PaddleMobileError.loaderError(message: "param read size error")
-            }
-            
-            // TODO: use script to convert
-//            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
-//            for i in 0..<tensor.numel() {
-//                tensor.data[i] = P.init(inFloat: tmpPointer[i])
-//            }
-//            tmpPointer.deinitialize(count: tmpCapacity)
-//            tmpPointer.deallocate()
-            
-            nowIndex += bytesRead
-        }
-        
-        deinit {
-            fclose(file)
-        }
+  class ParaLoader {
+    let file: UnsafeMutablePointer<FILE>
+    let fileSize: Int
+    var nowIndex: Int
+    init(paramPath: String) throws {
+      guard let tmpFile = fopen(paramPath, "rb") else {
+        throw PaddleMobileError.loaderError(message: "open param file error" + paramPath)
+      }
+      file = tmpFile
+      fseek(file, 0, SEEK_END)
+      fileSize = ftell(file)
+      guard fileSize > 0 else {
+        throw PaddleMobileError.loaderError(message: "param file size is too small")
+      }
+      rewind(file)
+      nowIndex = 0
     }
-    public init(){}
-    public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
-        guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
-            throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+    
+    func read(tensor: Tensor<P>) throws {
+      guard nowIndex <= fileSize else {
+        throw PaddleMobileError.loaderError(message: "out of the file range")
+      }
+      
+      func pointerReader<T>(type: T.Type) -> T { // reads MemoryLayout<T>.size bytes from `file` and returns them as one T
+        let ptr = UnsafeMutablePointer<T>.allocate(capacity: 1) // capacity counts T elements, not bytes
+        fread(ptr, 1, MemoryLayout<T>.size, file)
+        nowIndex += MemoryLayout<T>.size
+        let pointee = ptr.pointee
+        ptr.deinitialize(count: 1) // exactly one T lives in this allocation, whatever T is
+        ptr.deallocate()
+        return pointee
+      }
+      
+      let _ = pointerReader(type: UInt32.self)
+      let lodLevel = pointerReader(type: UInt64.self)
+      for _ in 0..<lodLevel {
+        let size = pointerReader(type: UInt64.self)
+        for _ in 0..<Int(size/UInt64(MemoryLayout<size_t>.size)){
+          _ = pointerReader(type: size_t.self)
         }
-        
-        do {
-            let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
-                serializedData: modelData)
-            
-            let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
-            let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
-            print(programDesc)
-
-            guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
-                throw PaddleMobileError.loaderError(message: "load para error")
-            }
-            
-            guard programDesc.blocks.count > 0 else {
-                throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
-            }
-            
-            // to get feed key and fetch key
-            let block = programDesc.blocks[0]
-            guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
-                throw PaddleMobileError.loaderError(message: "at least two operator")
-            }
-            guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
-                throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
+      }
+      
+      let _ = pointerReader(type: UInt32.self)
+      
+      let tensorDescSize = pointerReader(type: Int32.self)
+      
+      fseek(file, Int(tensorDescSize), SEEK_CUR)
+      nowIndex += Int(tensorDescSize)
+      
+      /*
+       The Data Type is not inspected here; the precision is fixed directly by the enclosing generic parameter.
+       */
+      
+      // The models passed in today are Float-typed; this part should follow the model's own data type.
+      //            let tmpCapacity = MemoryLayout<Float>.size * tensor.numel()
+      //            let tmpPointer = UnsafeMutablePointer<Float>.allocate(capacity: tmpCapacity);
+      let bytesRead = fread(tensor.data.pointer, 1, tensor.data.size, file)
+      
+      guard bytesRead == tensor.data.size else {
+        throw PaddleMobileError.loaderError(message: "param read size error")
+      }
+      
+      // TODO: use script to convert
+      //            let bytesRead = fread(tmpPointer, 1, tmpCapacity, file)
+      //            for i in 0..<tensor.numel() {
+      //                tensor.data[i] = P.init(inFloat: tmpPointer[i])
+      //            }
+      //            tmpPointer.deinitialize(count: tmpCapacity)
+      //            tmpPointer.deallocate()
+      
+      nowIndex += bytesRead
+    }
+    
+    deinit {
+      fclose(file)
+    }
+  }
+  public init(){}
+  public func load(device: MTLDevice, modelPath: String, paraPath: String) throws -> Program{
+    
+    guard let modelData = try? Data.init(contentsOf: URL.init(fileURLWithPath: modelPath)) else {
+      throw PaddleMobileError.loaderError(message: "load " + modelPath + " failed !")
+    }
+    
+    do {
+      let protoProgram = try PaddleMobile_Framework_Proto_ProgramDesc.init(
+        serializedData: modelData)
+      
+      let originProgramDesc = ProgramDesc.init(protoProgram: protoProgram)
+      let programDesc = ProgramOptimize<P>.init().optimize(originProgramDesc: originProgramDesc)
+      print(programDesc)
+      
+      guard let paraLoader = try? ParaLoader.init(paramPath: paraPath) else {
+        throw PaddleMobileError.loaderError(message: "load para error")
+      }
+      
+      guard programDesc.blocks.count > 0 else {
+        throw PaddleMobileError.loaderError(message: "count of blocks must greater than 0")
+      }
+      
+      // to get feed key and fetch key
+      let block = programDesc.blocks[0]
+      guard let firstOp = block.ops.first, let lastOp = block.ops.last else {
+        throw PaddleMobileError.loaderError(message: "at least two operator")
+      }
+      
+      guard firstOp.type == gFeedType, lastOp.type == gFetchType else {
+        throw PaddleMobileError.loaderError(message: "the first op is not feed or the last op is not fetch")
+      }
+      
+      guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
+        throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
+      }
+      guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
+        throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
+      }
+      
+      let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
+      
+      // to load memory
+      for block in programDesc.blocks {
+        for varDesc in block.vars {
+          if (varDesc.type == .LodTensor) {
+            guard let tensorDesc = varDesc.tensorDesc else {
+              throw PaddleMobileError.loaderError(message: "get tensor desc failed")
             }
             
-            guard let inputKey = opInfos[gFeedType]?.inputs.first, let outKey = opInfos[gFetchType]?.outputs.first else {
-                throw PaddleMobileError.loaderError(message: "the feed input key or fetch output key not found")
-            }
-            guard let feedKey = firstOp.inputs[inputKey]?.first, let fetchKey = lastOp.outputs[outKey]?.first else {
-                throw PaddleMobileError.loaderError(message: "feed key or fetch key not found")
+            if (varDesc.persistable
+              && varDesc.type != .FeedMiniBatch
+              && varDesc.type != .FetchList) {
+              let dimArr = tensorDesc.dims
+              
+              guard dimArr.count > 0 else {
+                throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
+              }
+              
+              let dim = Dim.init(inDim: dimArr)
+              let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
+              do {
+                try paraLoader.read(tensor: tensor)
+              } catch let error {
+                throw error
+              }
+              tensor.convert(to: DataLayout.NHWC())
+              //                            tensor.initBuffer(device: device)
+              scope[varDesc.name] = tensor
+            } else {
+              let dim = Dim.init(inDim: tensorDesc.dims)
+              scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
             }
-            
-            let scope = Scope.init(inFeedKey: feedKey, inFetchKey: fetchKey)
-            
-            // to load memory
-            for block in programDesc.blocks {
-                for varDesc in block.vars {
-                    if (varDesc.type == .LodTensor) {
-                        guard let tensorDesc = varDesc.tensorDesc else {
-                            throw PaddleMobileError.loaderError(message: "get tensor desc failed")
-                        }
-                        
-                        if (varDesc.persistable
-                            && varDesc.type != .FeedMiniBatch
-                            && varDesc.type != .FetchList) {
-                            let dimArr = tensorDesc.dims
-                            
-                            guard dimArr.count > 0 else {
-                                throw PaddleMobileError.loaderError(message: "tensor desc dim size error")
-                            }
-                            
-                            let dim = Dim.init(inDim: dimArr)
-                            let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
-                            do {
-                                try paraLoader.read(tensor: tensor)
-                            } catch let error {
-                                throw error
-                            }
-                            tensor.convert(to: DataLayout.NHWC())
-//                            tensor.initBuffer(device: device)
-                            scope[varDesc.name] = tensor
-                        } else {
-                            let dim = Dim.init(inDim: tensorDesc.dims)
-                            scope[varDesc.name] = Texture<P>.init(device: device, inDim: dim)
-                        }
-                    } else {
-                        if varDesc.name == fetchKey {
-                            scope[varDesc.name] = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: 0.0)
-                        } else if varDesc.name == feedKey {
-                        }
-                    }
-                }
+          } else {
+            if varDesc.name == fetchKey {
+              scope[varDesc.name] = ResultHolder<P>.init(inDim: [], inResult: [], inElapsedTime: 0.0)
+            } else if varDesc.name == feedKey {
             }
-            
-            let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope)
-            
-            return program
-        } catch _ {
-            throw PaddleMobileError.loaderError(message: "protobuf decoder error")
+          }
         }
+      }
+      
+      let program = Program.init(inProgramDesc: programDesc, inParamPath: paraPath, inScope: scope)
+      
+      return program
+    } catch let error { // loader errors thrown above keep their own message; anything else is reported as a decoder failure
+      throw (error as? PaddleMobileError) ?? PaddleMobileError.loaderError(message: "protobuf decoder error")
+    }
+  }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
index e79707a475..c42e5fa1d8 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -97,6 +97,13 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
   }
   
   func delogOutput() {
+    print("stride: ")
+    print(para.stride)
+    print("dilations: ")
+    print(para.dilations)
+    
+    
+    
     print(" \(type) output: ")
     print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
   }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
index cf67ef7ccf..ba83de1bf8 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift
@@ -43,8 +43,15 @@ class ConvTransposeOp<P: PrecisionType>: Operator<ConvTransposeKernel<P>, ConvTr
   }
   
   func delogOutput() {
-    print("conv transpose delog")
-    let _: P? = para.input.metalTexture.logDesc(header: "conv transpose input: ", stridable: true)
-    let _: P? = para.output.metalTexture.logDesc(header: "conv transpose output: ", stridable: true)
+    print(" \(type) output: ")
+    let originDim = para.output.originDim
+    if para.output.transpose == [0, 1, 2, 3] {
+      let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+      print(outputArray.strideArray())
+    } else if para.output.transpose == [0, 2, 3, 1] {
+      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    } else {
+      print(" not implement")
+    }
   }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
index a6c3162155..4812f05182 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift
@@ -61,11 +61,18 @@ class ElementwiseAddOp<P: PrecisionType>: Operator<ElementwiseAddKernel<P>, Elem
     print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray())
     print(" \(type) inputY: ")
     print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray())
+    
     print(" \(type) output: ")
     let originDim = para.output.originDim
-    let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
-    print(outputArray.strideArray())
-    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    if para.output.transpose == [0, 1, 2, 3] {
+      let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]))
+      print(outputArray.strideArray())
+    } else if para.output.transpose == [0, 2, 3, 1] {
+      print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+    } else {
+      print(" not implement")
+    }
+    
   }
   
   func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index f2bc495158..eabadc9d44 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable, Testable
         print("offset y: \(offsetY)")
         
         let offsetZ = 0.0
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
         
         var invs: [P] = []
         let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
index 3c999052fc..83dd4f996a 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -27,9 +27,10 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
     
     param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1])
     
-    let offsetX = param.filter.width/2 - Int(param.paddings[0])
-    let offsetY = param.filter.height/2 - Int(param.paddings[1])
+    let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0])
     
+    let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1])
+
     param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
     param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
     
@@ -37,7 +38,11 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
     print("offset y: \(offsetY)")
     
     let offsetZ = 0.0
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+    print("metal param: ")
+    print(inMetalParam)
+    
+    metalParam = inMetalParam
   }
   
   func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
index 2878d82879..c5d3ffe6c9 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -81,7 +81,7 @@ class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
     let offsetZ = 0.0
     
     print(" fuck ")
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
     
     var invs: [P] = []
     let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
index 60ce245a78..680beba1ea 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift
@@ -21,6 +21,8 @@ public struct MetalConvParam {
   let strideX: UInt16
   let strideY: UInt16
   let paddedZ: UInt16
+  let dilationX: UInt16
+  let dilationY: UInt16
 }
 
 class ConvKernel<P: PrecisionType>: Kernel, Computable {
@@ -39,7 +41,7 @@ class ConvKernel<P: PrecisionType>: Kernel, Computable {
     let offsetZ = 0.0
     param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
     
-    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
   }
   
   func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
index 81aa9d2ff0..9354972d71 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift
@@ -45,6 +45,7 @@ class ConvTransposeKernel<P: PrecisionType>: Kernel, Computable{
     metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
     
     param.output.initTexture(device: device, inTranspose: param.input.transpose)
+    param.filter.initBuffer(device: device)
   }
   
   func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
index 884b2ae3c3..c80df94bb5 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift
@@ -55,7 +55,7 @@ class ElementwiseAddKernel<P: PrecisionType>: Kernel, Computable {
     }
     emp.yoff = 4 - Int32(param.inputY.tensorDim.cout())
     if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-      print("===> elementwise_add fast!!!")
+//      print("===> elementwise_add fast!!!")
       emp.fast = 1
     }
     
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
index 06c93da59d..a17366cbe0 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -16,198 +16,199 @@
 using namespace metal;
 
 struct MetalConvParam {
-    short offsetX;
-    short offsetY;
-    short offsetZ;
-    ushort strideX;
-    ushort strideY;
+  short offsetX;    // added to gid.x * strideX to locate the input texel
+  short offsetY;    // added to gid.y * strideY to locate the input texel
+  short offsetZ;    // not read by the kernels visible in this hunk
+  ushort strideX;   // horizontal step between neighbouring output texels
+  ushort strideY;   // vertical step between neighbouring output texels
+  ushort paddedZ;   // keeps the layout in step with the host initializer, which passes paddedZ between strideY and dilationX
+  ushort dilationX; // horizontal dilation supplied by the host
+  ushort dilationY; // vertical dilation supplied by the host
 };
 
-
 kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device half4 *weights [[buffer(1)]],
-                                         const device half4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
-    
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    
-    
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 1;
-    
-    uint input_arr_size = inTexture.get_array_size();
-    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                              constant MetalConvParam &param [[buffer(0)]],
+                                              const device half4 *weights [[buffer(1)]],
+                                              const device half4 *biase [[buffer(2)]],
+                                              const device float4 *new_scale [[buffer(3)]],
+                                              const device float4 *new_biase [[buffer(4)]],
+                                              uint3 gid [[thread_position_in_grid]]) { // half-precision 1x1 convolution; each thread writes one half4 at (gid.xy, slice gid.z)
+  
+  if (gid.x >= outTexture.get_width() || // threads outside the output texture do nothing
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // NOTE(review): the signed offsets are converted to ushort here; confirm a negative offset is never passed
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest filtering, clamp_to_zero addressing
+  const uint kernelHXW = 1; // a 1x1 kernel has a single tap
+  
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // first weight index for this output slice; the four blocks that follow feed output.x/.y/.z/.w
+  
+  half4 output = half4(0.0);
+  
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) { // accumulate over every input slice
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
     
-    half4 output = half4(0.0);
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
     
-    half4 input;
-    for (uint i = 0; i < input_arr_size; ++i) {
-        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-        output.x += dot(input, weight_x);
-        
-        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-        output.y += dot(input, weight_y);
-        
-        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-        output.z += dot(input, weight_z);
-        
-        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-        output.w += dot(input, weight_w);
-    }
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
     
-    output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-    outTexture.write(output, gid.xy, gid.z);
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  
+  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); // (sum + biase) * new_scale + new_biase computed in float, floored at zero, converted back to half
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device half4 *weights [[buffer(1)]],
-                                         const device half4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
-    
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 9;
-    uint input_arr_size = inTexture.get_array_size();
-    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    
-    half4 output = half4(0.0);
-    
-    half4 input[9];
-    for (uint i = 0; i < input_arr_size; ++i) {
-        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-        for (int j = 0; j < 9; ++j) {
-            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.x += dot(input[j], weight_x);
-            
-            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.y += dot(input[j], weight_y);
-            
-            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.z += dot(input[j], weight_z);
-            
-            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.w += dot(input[j], weight_w);
-        }
+                                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                              constant MetalConvParam &param [[buffer(0)]],
+                                              const device half4 *weights [[buffer(1)]],
+                                              const device half4 *biase [[buffer(2)]],
+                                              const device float4 *new_scale [[buffer(3)]],
+                                              const device float4 *new_biase [[buffer(4)]],
+                                              uint3 gid [[thread_position_in_grid]]) { // half-precision 3x3 convolution; each thread writes one half4 at (gid.xy, slice gid.z)
+  
+  if (gid.x >= outTexture.get_width() || // threads outside the output texture do nothing
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // NOTE(review): offsets are converted to ushort, so a negative offset would wrap before the +/-1 taps below are formed; confirm callers never pass one
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest filtering, clamp_to_zero addressing
+  const uint kernelHXW = 9; // 3 * 3 taps
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // first weight index for this output slice
+  
+  half4 output = half4(0.0);
+  
+  half4 input[9]; // the 3x3 neighbourhood of posInInput in row-major order; index 4 is the centre
+  for (uint i = 0; i < input_arr_size; ++i) { // accumulate over every input slice
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i); // taps sit at a fixed +/-1; param.dilationX/dilationY are not read in this kernel
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) { // weights are indexed as [output component][tap j][input slice i]
+      half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+      
+      half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+      
+      half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+      
+      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
     }
-    output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-    outTexture.write(output, gid.xy, gid.z);
+  }
+  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); // (sum + biase) * new_scale + new_biase computed in float, floored at zero, converted back to half
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<half, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device half4 *weights [[buffer(1)]],
-                         const device half4 *biase [[buffer(2)]],
-                         uint3 gid [[thread_position_in_grid]]) {
-    
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 1;
+                              texture2d_array<half, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device half4 *weights [[buffer(1)]],
+                              const device half4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) { // half-precision 1x1 convolution plus bias; each thread writes one half4 at (gid.xy, slice gid.z)
+  
+  if (gid.x >= outTexture.get_width() || // threads outside the output texture do nothing
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // NOTE(review): the signed offsets are converted to ushort here; confirm a negative offset is never passed
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest filtering, clamp_to_zero addressing
+  const uint kernelHXW = 1; // a 1x1 kernel has a single tap
+  
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4; // first weight index for this output slice; the four blocks that follow feed output.x/.y/.z/.w
+  
+  half4 output = half4(0.0);
+  
+  half4 input;
+  for (uint i = 0; i < input_arr_size; ++i) { // accumulate over every input slice
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
     
-    uint input_arr_size = inTexture.get_array_size();
-    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
     
-    half4 output = half4(0.0);
+    half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
     
-    half4 input;
-    for (uint i = 0; i < input_arr_size; ++i) {
-        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-        half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-        output.x += dot(input, weight_x);
-        
-        half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-        output.y += dot(input, weight_y);
-        
-        half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-        output.z += dot(input, weight_z);
-        
-        half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-        output.w += dot(input, weight_w);
-    }
-    output = output + biase[gid.z];
-    outTexture.write(output, gid.xy, gid.z);
+    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  output = output + biase[gid.z]; // only the bias is added here; no scale, shift or clamp is applied
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
-                                                   texture2d_array<half, access::write> outTexture [[texture(1)]],
-                                                   constant MetalConvParam &param [[buffer(0)]],
-                                                   const device half *weights [[buffer(1)]],
-                                                   const device half4 *biase [[buffer(2)]],
-                                                   const device float4 *new_scale [[buffer(3)]],
-                                                   const device float4 *new_biase [[buffer(4)]],
-                                                   uint3 gid [[thread_position_in_grid]]) {
-    
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    uint output_slice = gid.z;
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 9;
-    uint weithTo = gid.z * kernelHXW * 4;
-    half4 output = half4(0.0);
-    half4 inputs[9];
-    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-    for (int j = 0; j < 9; ++j) {
-        half4 input = inputs[j];
-        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-    }
-    output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0));
-    outTexture.write(output, gid.xy, gid.z);
+                                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                                        constant MetalConvParam &param [[buffer(0)]],
+                                                        const device half *weights [[buffer(1)]],
+                                                        const device half4 *biase [[buffer(2)]],
+                                                        const device float4 *new_scale [[buffer(3)]],
+                                                        const device float4 *new_biase [[buffer(4)]],
+                                                        uint3 gid [[thread_position_in_grid]]) { // half-precision depthwise 3x3 convolution; each thread writes one half4 at (gid.xy, slice gid.z)
+  
+  if (gid.x >= outTexture.get_width() || // threads outside the output texture do nothing
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z; // the input slice that is read has the same index as the output slice that is written
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // NOTE(review): offsets are converted to ushort, so a negative offset would wrap before the +/-1 taps below are formed; confirm callers never pass one
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest filtering, clamp_to_zero addressing
+  const uint kernelHXW = 9; // 3 * 3 taps
+  uint weithTo = gid.z * kernelHXW * 4; // scalar weights: 9 per component, 4 components per slice
+  half4 output = half4(0.0);
+  half4 inputs[9]; // the 3x3 neighbourhood of posInInput in row-major order; index 4 is the centre
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice); // taps sit at a fixed +/-1; param.dilationX/dilationY are not read in this kernel
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) { // each component is multiplied only by its own 9 weights; components are never mixed
+    half4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+  output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); // (sum + biase) * new_scale + new_biase computed in float, floored at zero, converted back to half
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 
@@ -223,41 +223,41 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
+  
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  
+  float4 output = float4(0.0);
+  
+  float4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
     
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 1;
-    
-    uint input_arr_size = inTexture.get_array_size();
-    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
     
-    float4 output = float4(0.0);
+    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
     
-    float4 input;
-    for (uint i = 0; i < input_arr_size; ++i) {
-        input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
-        float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
-        output.x += dot(input, weight_x);
-        
-        float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
-        output.y += dot(input, weight_y);
-        
-        float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
-        output.z += dot(input, weight_z);
-        
-        float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
-        output.w += dot(input, weight_w);
-    }
-    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-    outTexture.write(output, gid.xy, gid.z);
+    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
@@ -268,92 +268,92 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
-    
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 9;
-    uint input_arr_size = inTexture.get_array_size();
-    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    
-    float4 output = float4(0.0);
-    
-    float4 input[9];
-    for (uint i = 0; i < input_arr_size; ++i) {
-        input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-        input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-        input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-        input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
-        input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-        input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-        input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-        input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-        input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
-        for (int j = 0; j < 9; ++j) {
-            float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.x += dot(input[j], weight_x);
-            
-            float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.y += dot(input[j], weight_y);
-            
-            float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.z += dot(input[j], weight_z);
-            
-            float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.w += dot(input[j], weight_w);
-        }
+  
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  
+  float4 output = float4(0.0);
+  
+  float4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+      
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+      
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+      
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
     }
-    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-    outTexture.write(output, gid.xy, gid.z);
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device float *weights [[buffer(1)]],
-                                         const device float4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
-    
-    if (gid.x >= outTexture.get_width() ||
-        gid.y >= outTexture.get_height() ||
-        gid.z >= outTexture.get_array_size()) {
-        return;
-    }
-    uint output_slice = gid.z;
-    ushort2 stride = ushort2(param.strideX, param.strideY);
-    ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
-    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-    const uint kernelHXW = 9;
-    uint weithTo = gid.z * kernelHXW * 4;
-    float4 output = float4(0.0);
-    float4 inputs[9];
-    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
-    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
-    inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
-    inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
-    inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
-    inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
-    inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
-    inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
-    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
-    for (int j = 0; j < 9; ++j) {
-        float4 input = inputs[j];
-        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
-        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
-        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
-        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
-    }
-    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
-    outTexture.write(output, gid.xy, gid.z);
+                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                                   constant MetalConvParam &param [[buffer(0)]],
+                                                   const device float *weights [[buffer(1)]],
+                                                   const device float4 *biase [[buffer(2)]],
+                                                   const device float4 *new_scale [[buffer(3)]],
+                                                   const device float4 *new_biase [[buffer(4)]],
+                                                   uint3 gid [[thread_position_in_grid]]) { // float depthwise 3x3 convolution; each thread writes one float4 at (gid.xy, slice gid.z)
+  
+  if (gid.x >= outTexture.get_width() || // threads outside the output texture do nothing
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z; // the input slice that is read has the same index as the output slice that is written
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); // NOTE(review): offsets are converted to ushort, so a negative offset would wrap before the +/-1 taps below are formed; confirm callers never pass one
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); // pixel coordinates, nearest filtering, clamp_to_zero addressing
+  const uint kernelHXW = 9; // 3 * 3 taps
+  uint weithTo = gid.z * kernelHXW * 4; // scalar weights: 9 per component, 4 components per slice
+  float4 output = float4(0.0);
+  float4 inputs[9]; // the 3x3 neighbourhood of posInInput in row-major order; index 4 is the centre
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice); // taps sit at a fixed +/-1; param.dilationX/dilationY are not read in this kernel
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) { // each component is multiplied only by its own 9 weights; components are never mixed
+    float4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+  output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); // (sum + biase) * new_scale + new_biase, floored at zero
+  outTexture.write(output, gid.xy, gid.z);
 }
 
 // conv
@@ -447,10 +447,10 @@ kernel void depthwise_conv_3x3(texture2d_array<float, access::sample> inTexture
 }
 
 kernel void conv_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                         constant MetalConvParam &param [[buffer(0)]],
-                         const device float4 *weights [[buffer(1)]],
-                         uint3 gid [[thread_position_in_grid]]) {
+                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                     constant MetalConvParam &param [[buffer(0)]],
+                     const device float4 *weights [[buffer(1)]],
+                     uint3 gid [[thread_position_in_grid]]) {
   
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height() ||
@@ -532,13 +532,13 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
 }
 
 kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
-                                         const device float4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         const device float4 *new_scale [[buffer(3)]],
+                         const device float4 *new_biase [[buffer(4)]],
+                         uint3 gid [[thread_position_in_grid]]) {
   
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height() ||
@@ -556,17 +556,20 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
   
   float4 output = float4(0.0);
   
+  ushort dilation_x = param.dilationX;
+  ushort dilation_y = param.dilationY;
+  
   float4 input[9];
   for (uint i = 0; i < input_arr_size; ++i) {
-    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
-    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
-    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
-    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y - dilation_y), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - dilation_y), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y - dilation_y), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y), i);
     input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
-    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
-    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
-    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
-    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x,    posInInput.y + dilation_y), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + dilation_y), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x,    posInInput.y + dilation_y), i);
     for (int j = 0; j < 9; ++j) {
       float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
       output.x += dot(input[j], weight_x);
@@ -586,13 +589,13 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
 }
 
 kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                                   constant MetalConvParam &param [[buffer(0)]],
-                                                   const device float *weights [[buffer(1)]],
-                                                   const device float4 *biase [[buffer(2)]],
-                                                   const device float4 *new_scale [[buffer(3)]],
-                                                   const device float4 *new_biase [[buffer(4)]],
-                                                   uint3 gid [[thread_position_in_grid]]) {
+                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                   constant MetalConvParam &param [[buffer(0)]],
+                                   const device float *weights [[buffer(1)]],
+                                   const device float4 *biase [[buffer(2)]],
+                                   const device float4 *new_scale [[buffer(3)]],
+                                   const device float4 *new_biase [[buffer(4)]],
+                                   uint3 gid [[thread_position_in_grid]]) {
   
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height() ||
@@ -629,12 +632,12 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
 
 #pragma mark - conv bn relu
 kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
-                                         const device float4 *new_scale [[buffer(2)]],
-                                         const device float4 *new_biase [[buffer(3)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
   
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height() ||
@@ -673,12 +676,12 @@ kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTe
 }
 
 kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                         constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
-                                         const device float4 *new_scale [[buffer(2)]],
-                                         const device float4 *new_biase [[buffer(3)]],
-                                         uint3 gid [[thread_position_in_grid]]) {
+                                     texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                     constant MetalConvParam &param [[buffer(0)]],
+                                     const device float4 *weights [[buffer(1)]],
+                                     const device float4 *new_scale [[buffer(2)]],
+                                     const device float4 *new_biase [[buffer(3)]],
+                                     uint3 gid [[thread_position_in_grid]]) {
   
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height() ||
@@ -726,12 +729,12 @@ kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTe
 }
 
 kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
-                                                   constant MetalConvParam &param [[buffer(0)]],
-                                                   const device float *weights [[buffer(1)]],
-                                                   const device float4 *new_scale [[buffer(2)]],
-                                                   const device float4 *new_biase [[buffer(3)]],
-                                                   uint3 gid [[thread_position_in_grid]]) {
+                                               texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                               constant MetalConvParam &param [[buffer(0)]],
+                                               const device float *weights [[buffer(1)]],
+                                               const device float4 *new_scale [[buffer(2)]],
+                                               const device float4 *new_biase [[buffer(3)]],
+                                               uint3 gid [[thread_position_in_grid]]) {
   
   if (gid.x >= outTexture.get_width() ||
       gid.y >= outTexture.get_height() ||
diff --git a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
index 8e1915a497..f1a94e4e2a 100644
--- a/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/BlockDesc.swift
@@ -48,8 +48,10 @@ extension BlockDesc: CustomStringConvertible, CustomDebugStringConvertible {
     var description: String {
         var str = ""
         
-        for op in ops {
-            str += op.description
+        for i in 0..<ops.count {
+          str += " op \(i): "
+          let op = ops[i]
+          str += op.description
         }
         
         for varDesc in vars {
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
index d2c8228ed9..e7953e8349 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -16,245 +16,246 @@ import Accelerate
 import Foundation
 
 protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{
-    var dim: Dim { get set }
-    func numel() -> Int
-    var layout: DataLayout { get }
+  var dim: Dim { get set }  // shape descriptor; the default numel() below is computed from it
+  func numel() -> Int  // element count; the Tensorial extension supplies dim.numel() as the default
+  var layout: DataLayout { get }  // only a getter is required by the protocol
 }
 
 extension Tensorial {
-    func numel() -> Int {
-        return dim.numel()
-    }
+  /// Default element count shared by all conforming types:
+  /// forwards to `numel()` of the `dim` property.
+  func numel() -> Int { return dim.numel() }
 }
 
 class Tensor<P: PrecisionType>: Tensorial {
-    enum BufferPrecision {
-        case Float32, Float16
+  /// Element format `initBuffer` writes into the Metal buffer: `Float32`
+  /// copies 4-byte values, `Float16` converts to 2-byte values first.
+  enum BufferPrecision { case Float32, Float16 }
+  
+  var data: Data
+  var dim: Dim
+  var buffer: MTLBuffer!
+  private(set) var layout: DataLayout
+  
+  class Data {
+    init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {  // adopts an existing pointer; allocates nothing itself
+      size = inSize
+      pointer = inPointer
     }
-    
-    var data: Data
-    var dim: Dim
-    var buffer: MTLBuffer!
-    private(set) var layout: DataLayout
-    
-    class Data {
-        init(inSize: Int, inPointer: UnsafeMutablePointer<P>) {
-            size = inSize
-            pointer = inPointer
-        }
-        let size: Int
-        var pointer: UnsafeMutablePointer<P>
-        subscript(index: Int) -> P{
-            get {
-                return pointer[index]
-            }
-            set {
-                pointer[index] = newValue
-            }
-        }
-        func release() {
-            pointer.deinitialize(count: size)
-            pointer.deallocate()
-        }
-        deinit {
-//            release()
-        }
+    let size: Int
+    var pointer: UnsafeMutablePointer<P>
+    /// Direct element access through `pointer`.
+    /// `index` is not validated against `size`; callers must keep it
+    /// inside the allocation because the unsafe pointer subscript does
+    /// no bounds checking.
+    subscript(index: Int) -> P {
+      get { return pointer[index] }
+      set { pointer[index] = newValue }
     }
- 
-    required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
-        dim = inDim
-        let size = inDim.numel() * MemoryLayout<P>.size
-        let pointer = UnsafeMutablePointer<P>.allocate(capacity: size)
-        data = Data.init(inSize: size, inPointer: pointer)
-        layout = inLayout
+    /// Deinitializes `size` elements at `pointer`, then frees the allocation (`pointer` dangles afterwards).
+    func release() {
+      pointer.deinitialize(count: size).deallocate()
+    }
+    deinit {
+      // release() stays commented out, so destroying a Data object does not free `pointer`.
+    }
+  }
+  
+  /// Allocates uninitialized storage sized from `inDim`. NOTE(review): the same value (numel * MemoryLayout<P>.size) is used as element capacity and as `Data.size` — confirm the unit.
+  required init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
+    let storageSize = inDim.numel() * MemoryLayout<P>.size
+    data = Data(inSize: storageSize, inPointer: UnsafeMutablePointer<P>.allocate(capacity: storageSize))
+    dim = inDim
+    layout = inLayout
+  }
+  
+  func convert(to: DataLayout) {
+    guard to != layout else {
+      return
     }
     
-    func convert(to: DataLayout) {
-        guard to != layout else {
-            return
-        }
-        
-        guard dim.cout() == 4 else {
-            return
-        }
-        
-        guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
-            // other not support
-            return
-        }
-        let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
-        
-        if layout == DataLayout.NCHW() {
-            NCHW2NHWC(newPtr: newPointer)
-        }
-        
-        data.release()
-        data.pointer = newPointer
-        layout = to
+    guard dim.cout() == 4 else {
+      return
     }
     
-    func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
-        var float32Buffer = vImage_Buffer(data: input,  height: 1, width: UInt(count), rowBytes: count * 4)
-        var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
-        guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else {
-            fatalError(" float 32 to float 16 error ! ")
-        }
+    guard layout == DataLayout.NCHW() && to == DataLayout.NHWC() else {
+      // converting between any other pair of layouts is not supported
+      return
     }
+    let newPointer = UnsafeMutablePointer<P>.allocate(capacity: data.size)
     
-    func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
-        guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
-            fatalError(" not support yet ")
-        }
-        
-        let precisionSize: Int
-        switch precision {
-        case .Float32:
-            precisionSize = 4
-        case .Float16:
-            precisionSize = 2
-        }
-        
-        if dim.cout() == 4 {
-            if layout == DataLayout.NHWC() {
-                let C = dim[3]
-                let cSlices = (C + 3) / 4
-                let paddedC = cSlices * 4
-                let count = paddedC * dim[0] * dim[1] * dim[2]
-                if C == paddedC {
-                    buffer = device.makeBuffer(length: count * precisionSize)
-                    switch precision {
-                    case .Float32:
-                        buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
-                    case .Float16:
-                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
-                    }
-                } else if C == 1 {
-                    buffer = device.makeBuffer(length: numel() * precisionSize)
-                    switch precision {
-                    case .Float32:
-                        buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
-                    case .Float16:
-                        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
-                    }
-                } else {
-                    buffer = device.makeBuffer(length: count * precisionSize)
-                    let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
-                    var tmpPointer = floatPointer
-                    var dstPtr = convertedPointer
-                    for _ in 0..<dim[0] * dim[1] * dim[2] {
-                        for j in 0..<paddedC {
-                            if j < C {
-                                dstPtr[j] = tmpPointer[j]
-                            }
-                        }
-                        tmpPointer += C
-                        dstPtr += paddedC
-                    }
-                    
-                    switch precision {
-                    case .Float32:
-                        buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
-                    case .Float16:
-                        float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
-                    }
-                    
-                    convertedPointer.deinitialize(count: count)
-                    convertedPointer.deallocate()
-                }
-            }
-        } else if dim.cout() == 1 {
-            buffer = device.makeBuffer(length: numel() * precisionSize)
-            switch precision {
-            case .Float32:
-                buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
-            case .Float16:
-                float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
-            }
-        } else {
-            fatalError(" not support !")
-        }
-        //TODO: release
-        data.release()
+    if layout == DataLayout.NCHW() {
+      NCHW2NHWC(newPtr: newPointer)
     }
     
-    var width: Int {
-        get {
-            if dim.cout() == 4 {
-                return dim[1]
-            } else {
-                fatalError()
-            }
-        }
+    data.release()
+    data.pointer = newPointer
+    layout = to
+  }
+  
+  /// Converts `count` Float32 values at `input` into Float16 values at `output` with vImage; traps if the conversion fails.
+  func float32ToFloat16(input: UnsafeMutablePointer<Float32>, output: UnsafeMutableRawPointer, count: Int) {
+    var source = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4)
+    var destination = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2)
+    let status = vImageConvert_PlanarFtoPlanar16F(&source, &destination, 0)
+    guard status == kvImageNoError else { fatalError(" float 32 to float 16 error ! ") }
+  }
+  
+  /// Uploads the tensor into `buffer` as Float32, or converted to Float16, and then releases the host copy.
+  /// 4-D NHWC data is padded to a multiple of 4 channels (except C == 1); 1-D data to a multiple of 4 elements.
+  /// Padding is zero. Traps when `data.pointer` is not a Float32 pointer or `dim.cout()` is neither 1 nor 4.
+  /// NOTE(review): a 4-D tensor that is not NHWC gets no buffer, yet its host data is still released — confirm.
+  func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) {
+    guard let floatPointer = data.pointer as? UnsafeMutablePointer<Float32> else {
+      fatalError(" not support yet ")
     }
     
-    var height: Int {
-        get {
-            if dim.cout() == 4 {
-                return dim[2]
-            } else {
-                fatalError()
-            }
-        }
+    let precisionSize: Int
+    switch precision {
+    case .Float32: precisionSize = 4
+    case .Float16: precisionSize = 2
     }
     
-    var channel: Int {
-        get {
-            if dim.cout() == 4 {
-                return dim[3]
-            } else {
-                fatalError()
+    if dim.cout() == 4 {
+      if layout == DataLayout.NHWC() {
+        let C = dim[3]
+        let cSlices = (C + 3) / 4
+        let paddedC = cSlices * 4
+        let count = paddedC * dim[0] * dim[1] * dim[2]
+        if C == paddedC {
+          buffer = device.makeBuffer(length: count * precisionSize)
+          switch precision {
+          case .Float32:
+            buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+          case .Float16:
+            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count)
+          }
+        } else if C == 1 {
+          buffer = device.makeBuffer(length: numel() * precisionSize)
+          switch precision {
+          case .Float32:
+            buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+          case .Float16:
+            float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+          }
+        } else {
+          buffer = device.makeBuffer(length: count * precisionSize)
+          let convertedPointer = UnsafeMutablePointer<Float32>.allocate(capacity: count)
+          var tmpPointer = floatPointer
+          var dstPtr = convertedPointer
+          for _ in 0..<dim[0] * dim[1] * dim[2] {
+            for j in 0..<paddedC {
+              // Padding lanes are zeroed explicitly: allocate(capacity:) returns uninitialized memory.
+              dstPtr[j] = j < C ? tmpPointer[j] : 0
             }
+            tmpPointer += C
+            dstPtr += paddedC
+          }
+          switch precision {
+          case .Float32:
+            buffer?.contents().copyMemory(from: convertedPointer, byteCount: count * MemoryLayout<P>.stride)
+          case .Float16:
+            float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count)
+          }
+          convertedPointer.deinitialize(count: count).deallocate()
         }
+      }
+    } else if dim.cout() == 1 {
+      // The buffer is padded to a multiple of 4 elements, but only the numel() real ones are
+      // copied; the tail keeps the zeros a freshly made MTLBuffer starts with.
+      let num = ((numel() + 3) / 4) * 4
+      buffer = device.makeBuffer(length: num * precisionSize)
+      switch precision {
+      case .Float32:
+        buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
+      case .Float16:
+        float32ToFloat16(input: floatPointer, output: buffer.contents(), count: numel())
+      }
+    } else {
+      fatalError(" not support !")
     }
-
+    // Host storage is freed after the upload; data.pointer must not be read afterwards.
+    data.release()
+  }
+  
+  /// Extent at index 1 of `dim`. Only defined for 4-D tensors: traps when
+  /// `dim.cout()` is anything other than 4.
+  /// NOTE(review): `NCHW2NHWC` swaps `dim` entries 1 and 3, which would put W here — confirm.
+  var width: Int {
+    guard dim.cout() == 4 else {
+      fatalError()
+    }
+    return dim[1]
+  }
+  
+  /// Extent at index 2 of `dim`. Only defined for 4-D tensors: traps when
+  /// `dim.cout()` is anything other than 4.
+  /// Index 2 is the entry `NCHW2NHWC` reads as H and leaves in place.
+  var height: Int {
+    guard dim.cout() == 4 else {
+      fatalError()
+    }
+    return dim[2]
+  }
+  
+  /// Extent at index 3 of `dim`. Only defined for 4-D tensors: traps when
+  /// `dim.cout()` is anything other than 4.
+  /// `initBuffer` reads the same index as the channel count C of NHWC data.
+  var channel: Int {
+    guard dim.cout() == 4 else {
+      fatalError()
+    }
+    return dim[3]
+  }
+  
+  
+  /// Copies `data.pointer` into `newPtr`, reordering a 4-D NCHW tensor so that the channel index varies fastest.
+  /// Afterwards entries 1 and 3 of `dim` are swapped via `swapeDimAt`.
+  /// NOTE(review): swapping 1 and 3 of (N, C, H, W) would give (N, W, H, C), not (N, H, W, C) — confirm what `swapeDimAt` does.
+  func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
+    let (batches, channels, rows, cols) = (dim[0], dim[1], dim[2], dim[3])
+    let planeSize = rows * cols
+    let imageSize = channels * rows * cols
     
-    func NCHW2NHWC(newPtr: UnsafeMutablePointer<P>) {
-        let N = dim[0]
-        let C = dim[1]
-        let H = dim[2]
-        let W = dim[3]
-        let HXW = H * W
-        let CXHXW = C * H * W
-        
-        var index: Int = 0
-        for n in 0..<N {
-            for h in 0..<H{
-                for w in 0..<W{
-                    for c in 0..<C{
-                        newPtr[index] = data.pointer[n * CXHXW + c * HXW + h * W + w]
-                        index += 1
-                    }
-                }
-            }
+    var dst = 0
+    for n in 0..<batches {
+      for h in 0..<rows {
+        for w in 0..<cols {
+          for c in 0..<channels {
+            newPtr[dst] = data.pointer[n * imageSize + c * planeSize + h * cols + w]
+            dst += 1
+          }
         }
-        dim.swapeDimAt(index1: 1, index2: 3)
+      }
     }
+    dim.swapeDimAt(index1: 1, index2: 3)
+  }
 }
 
 extension Tensor {
-    
-    var debugDescription: String {
-        var str = "dim: \(dim) \n"
-        str += "MTLBuffer: \(self.buffer) \n"
-        for i in 0..<buffer.length/MemoryLayout<P>.size {
-            str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
-        }
-        return str
+  
+  /// Text dump: `dim`, the MTLBuffer, then each buffer element read as `P` (no elements while `buffer` is still nil).
+  var debugDescription: String {
+    var str = "dim: \(dim) \nMTLBuffer: \(self.buffer) \n"
+    for i in 0..<(buffer?.length ?? 0)/MemoryLayout<P>.size {
+      str += " \(buffer.contents().assumingMemoryBound(to: P.self)[i])"
     }
-    
-    func logDataPointer(header: String = "") {
-        print(header)
-        var str = ""
-        str += "data size: \(data.size) \n"
-        str += "dim: \(dim) \n"
-        for i in 0..<numel() {
-            str += " \(data.pointer[i])"
-        }
-        print(str)
+    return str
+  }
+  
+  /// Prints `header`, then `data.size`, `dim` and the first `numel()` elements of `data.pointer`.
+  /// NOTE(review): `initBuffer` releases `data` without replacing the pointer, so calling this afterwards would read freed memory — confirm callers.
+  func logDataPointer(header: String = "") {
+    print(header)
+    var str = "data size: \(data.size) \ndim: \(dim) \n"
+    for i in 0..<numel() {
+      str += " \(data.pointer[i])"
     }
-    
-    var description: String {
-        return debugDescription
-    }
-    
+    print(str)
+  }
+  
+  /// Same text as `debugDescription`, so printing a tensor
+  /// produces the full dump.
+  var description: String { return debugDescription }
+  
 }
-- 
GitLab