update

b0a47b5b · liuruilong · 2e51fd63 · b0a47b5b · b0a47b5b · b0a47b5b
22 changed files
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Base.lproj/Main.storyboard
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14113" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="4MS-jc-i6A">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="14113" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
    <device id="retina4_7" orientation="portrait">
        <adaptation id="fullscreen"/>
    </device>
@@ -303,7 +303,7 @@
                </viewController>
                <placeholder placeholderIdentifier="IBFirstResponder" id="hGb-Pb-icS" userLabel="First Responder" sceneMemberID="firstResponder"/>
            </objects>
-            <point key="canvasLocation" x="-514" y="-3"/>
+            <point key="canvasLocation" x="-721" y="-427"/>
        </scene>
    </scenes>
    <resources>

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/MultiPredictViewController.swift
@@ -20,12 +20,6 @@ class MultiPredictViewController: UIViewController {
    let queue2 = MetalHelper.shared.device.makeCommandQueue()
    
    runner2 = Runner.init(inNet: genet, commandQueue: MetalHelper.shared.queue, inPlatform: .GPU)
-    
-   
-    
-    
-    
-    
  }

  @IBAction func predictAct(_ sender: Any) {

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/VideoCapture/VideoCapture.swift
@@ -169,6 +169,8 @@ public class VideoCapture: NSObject {
    }
 }

+
+@available(iOS 10.0, *)
 extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
  public func captureOutput(_ output: AVCaptureOutput, didOutput sampleBuffer: CMSampleBuffer, from connection: AVCaptureConnection) {
    // Because lowering the capture device's FPS looks ugly in the preview,
@@ -191,6 +193,7 @@ extension VideoCapture: AVCaptureVideoDataOutputSampleBufferDelegate {
  }
 }

+@available(iOS 10.0, *)
 extension VideoCapture: AVCapturePhotoCaptureDelegate {
  public func photoOutput(_ captureOutput: AVCapturePhotoOutput,
                          didFinishProcessingPhoto photoSampleBuffer: CMSampleBuffer?,

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -50,7 +50,7 @@ class ViewController: UIViewController {
  @IBOutlet weak var modelPickerView: UIPickerView!
  @IBOutlet weak var threadPickerView: UIPickerView!
  @IBOutlet weak var videoView: UIView!
-  var videoCapture: VideoCapture!
+//  var videoCapture: VideoCapture!

  var selectImage: UIImage?
  var inputPointer: UnsafeMutablePointer<Float32>?
@@ -106,6 +106,12 @@ class ViewController: UIViewController {
        return
      }
      
+//      for _ in 0..<10{
+//        runner.predict(texture: inTexture) { (success, resultHolder)  in
+//          resultHolder?.releasePointer()
+//        }
+//      }
+      
      let startDate = Date.init()
      for i in 0..<max {
        runner.predict(texture: inTexture) { [weak self] (success, resultHolder)  in
@@ -279,17 +285,18 @@ extension ViewController: VideoCaptureDelegate{
  }
  
  
-  func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
-//    if !bool1 {
-//      DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) {
-    self.predictTexture(texture: texture!)
-//      }
-
-      
-//      bool1 = true
-//    }
-    
-  }
+//  @available(iOS 10.0, *)
+//  func videoCapture(_ capture: VideoCapture, didCaptureVideoTexture texture: MTLTexture?, timestamp: CMTime) {
+////    if !bool1 {
+////      DispatchQueue.main.asyncAfter(deadline: DispatchTime.init(uptimeNanoseconds: 500000000)) {
+//    self.predictTexture(texture: texture!)
+////      }
+//
+//
+////      bool1 = true
+////    }
+//
+//  }

 }


--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -70,7 +70,7 @@
 		FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; };
 		FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; };
 		FC4FD9752140E1DE0073E130 /* PaddleMobile.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */; };
-		FC4FD9792140E4980073E130 /* PaddleMobile.h in Headers */ = {isa = PBXBuildFile; fileRef = FC4FD9772140E4980073E130 /* PaddleMobile.h */; settings = {ATTRIBUTES = (Public, ); }; };
+		FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */ = {isa = PBXBuildFile; fileRef = FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */; settings = {ATTRIBUTES = (Public, ); }; };
 		FC4FD97A2140E4980073E130 /* libpaddle-mobile.a in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD9782140E4980073E130 /* libpaddle-mobile.a */; };
 		FC4FD97E2140F2C30073E130 /* libstdc++.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = FC4FD97D2140F2C30073E130 /* libstdc++.tbd */; };
 		FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */; };
@@ -122,6 +122,12 @@
 		FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */; };
 		FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */; };
 		FCDE8A33212A917900F4A8F6 /* ConvTransposeOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */; };
+		FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */; };
+		FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */; };
+		FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */; };
+		FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */; };
+		FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */; };
+		FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */; };
 		FCE9D7B7214F869000B520C3 /* Net.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B6214F869000B520C3 /* Net.swift */; };
 		FCE9D7B9214FAA4800B520C3 /* NMSFetchResultKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */; };
 		FCEB684A212F00DB00D2448E /* PreluKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCEB6849212F00DB00D2448E /* PreluKernel.metal */; };
@@ -199,7 +205,7 @@
 		FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = "<group>"; };
 		FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ProgramOptimize.swift; sourceTree = "<group>"; };
 		FC4FD9742140E1DE0073E130 /* PaddleMobile.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = PaddleMobile.swift; sourceTree = "<group>"; };
-		FC4FD9772140E4980073E130 /* PaddleMobile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobile.h; sourceTree = "<group>"; };
+		FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PaddleMobileCPU.h; sourceTree = "<group>"; };
 		FC4FD9782140E4980073E130 /* libpaddle-mobile.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; path = "libpaddle-mobile.a"; sourceTree = "<group>"; };
 		FC4FD97D2140F2C30073E130 /* libstdc++.tbd */ = {isa = PBXFileReference; lastKnownFileType = "sourcecode.text-based-dylib-definition"; name = "libstdc++.tbd"; path = "usr/lib/libstdc++.tbd"; sourceTree = SDKROOT; };
 		FC5163F520EF556E00636C28 /* Texture2DTo2DArrayKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture2DTo2DArrayKernel.swift; sourceTree = "<group>"; };
@@ -251,6 +257,12 @@
 		FCDDC6CB212FDFDB00E5EF74 /* ReluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReluKernel.metal; sourceTree = "<group>"; };
 		FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PriorBoxKernel.metal; sourceTree = "<group>"; };
 		FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvTransposeOp.swift; sourceTree = "<group>"; };
+		FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluOp.swift; sourceTree = "<group>"; };
+		FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvAddAddPreluKernel.swift; sourceTree = "<group>"; };
+		FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluOp.swift; sourceTree = "<group>"; };
+		FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddPreluKernel.swift; sourceTree = "<group>"; };
+		FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.inc.metal; sourceTree = "<group>"; };
+		FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ElementwiseAddPreluKernel.metal; sourceTree = "<group>"; };
 		FCE9D7B6214F869000B520C3 /* Net.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Net.swift; sourceTree = "<group>"; };
 		FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = NMSFetchResultKernel.metal; sourceTree = "<group>"; };
 		FCEB6849212F00DB00D2448E /* PreluKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = PreluKernel.metal; sourceTree = "<group>"; };
@@ -390,6 +402,8 @@
 				FCDE8A32212A917900F4A8F6 /* ConvTransposeOp.swift */,
 				FCEB684B212F093800D2448E /* PreluOp.swift */,
 				FC803BBE214CB65A0094B8E5 /* ConvAddPreluOp.swift */,
+				FCE3A1A82153DE5100C37CDE /* ConvAddAddPreluOp.swift */,
+				FCE3A1AC2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift */,
 			);
 			path = Operators;
 			sourceTree = "<group>";
@@ -439,6 +453,8 @@
 				FCBCCC70212309A700D94F7E /* MulticlassNMSKernel.swift */,
 				FCDDC6C5212F9FB800E5EF74 /* PreluKernel.swift */,
 				FC803BC0214CB77A0094B8E5 /* ConvAddPreluKernel.swift */,
+				FCE3A1AA2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift */,
+				FCE3A1AE2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift */,
 			);
 			path = Kernels;
 			sourceTree = "<group>";
@@ -447,7 +463,7 @@
 			isa = PBXGroup;
 			children = (
 				FC4FD9782140E4980073E130 /* libpaddle-mobile.a */,
-				FC4FD9772140E4980073E130 /* PaddleMobile.h */,
+				FC4FD9772140E4980073E130 /* PaddleMobileCPU.h */,
 			);
 			path = CPU;
 			sourceTree = "<group>";
@@ -506,6 +522,8 @@
 				FC803BC6214CBA820094B8E5 /* Macro.metal */,
 				FC803BC8214CFC8D0094B8E5 /* FetchKernel.metal */,
 				FCE9D7B8214FAA4800B520C3 /* NMSFetchResultKernel.metal */,
+				FCE3A1B02153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal */,
+				FCE3A1B22153E91900C37CDE /* ElementwiseAddPreluKernel.metal */,
 			);
 			path = metal;
 			sourceTree = "<group>";
@@ -517,7 +535,7 @@
 			isa = PBXHeadersBuildPhase;
 			buildActionMask = 2147483647;
 			files = (
-				FC4FD9792140E4980073E130 /* PaddleMobile.h in Headers */,
+				FC4FD9792140E4980073E130 /* PaddleMobileCPU.h in Headers */,
 				FC292C85214257CB00CF622F /* CPUCompute.h in Headers */,
 				FC292C5421421B2F00CF622F /* PaddleMobileGPU.h in Headers */,
 				4AA1EA9E2148D6F900D0F791 /* ConcatKernel.inc.metal in Headers */,
@@ -632,8 +650,10 @@
 				FC039B9B20E11CA00081E9F8 /* Executor.swift in Sources */,
 				4AF9288421357BE3005B6C3A /* Elementwise.metal in Sources */,
 				FCD04E7020F31B720007374F /* ReshapeKernel.swift in Sources */,
+				FCE3A1B12153E90F00C37CDE /* ElementwiseAddPreluKernel.inc.metal in Sources */,
 				FCD04E7220F343420007374F /* ConvAddOp.swift in Sources */,
 				FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */,
+				FCE3A1AB2153DE8C00C37CDE /* ConvAddAddPreluKernel.swift in Sources */,
 				FC9D037920E229E4000F735A /* OpParam.swift in Sources */,
 				FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */,
 				FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */,
@@ -668,6 +688,7 @@
 				FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */,
 				FC039BA920E11CBC0081E9F8 /* ConvOp.swift in Sources */,
 				FC9D038420E23B01000F735A /* Texture.swift in Sources */,
+				FCE3A1B32153E91900C37CDE /* ElementwiseAddPreluKernel.metal in Sources */,
 				4AA1EAA2214912CD00D0F791 /* FlattenKernel.swift in Sources */,
 				4AA1EA982146666500D0F791 /* FlattenOp.swift in Sources */,
 				FCBCCC652122FCD700D94F7E /* TransposeOp.swift in Sources */,
@@ -700,6 +721,7 @@
 				4AA1EA92214665D700D0F791 /* ShapeOp.swift in Sources */,
 				FC803BC1214CB77A0094B8E5 /* ConvAddPreluKernel.swift in Sources */,
 				FCBCCC5D2122F8A100D94F7E /* DepthwiseConvOp.swift in Sources */,
+				FCE3A1AF2153E8EE00C37CDE /* ElementwiseAddPreluKernel.swift in Sources */,
 				FCE9D7B7214F869000B520C3 /* Net.swift in Sources */,
 				FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */,
 				FC039BAB20E11CBC0081E9F8 /* Operator.swift in Sources */,
@@ -718,7 +740,9 @@
 				FC9A19E32148C31300CD9CBF /* MobilenetSSD_AR.swift in Sources */,
 				FCDDC6CF212FE14700E5EF74 /* PriorBoxKernel.metal in Sources */,
 				FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */,
+				FCE3A1A92153DE5100C37CDE /* ConvAddAddPreluOp.swift in Sources */,
 				FC5163F620EF556E00636C28 /* Texture2DTo2DArrayKernel.swift in Sources */,
+				FCE3A1AD2153E8BA00C37CDE /* ElementwiseAddPreluOp.swift in Sources */,
 				FC039BC020E11CC20081E9F8 /* BlockDesc.swift in Sources */,
 				FC803BC3214CB79C0094B8E5 /* ConvAddPreluKernel.metal in Sources */,
 				4AA1EA90214664CD00D0F791 /* Split.metal in Sources */,

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile.xcscheme
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/paddle-mobile.xcscheme
@@ -33,7 +33,7 @@
      </AdditionalOptions>
   </TestAction>
   <LaunchAction
-      buildConfiguration = "Debug"
+      buildConfiguration = "Release"
      selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
      selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
      launchStyle = "0"

--- a/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobile.h
+++ b/metal/paddle-mobile/paddle-mobile/CPU/PaddleMobile.h
--- a/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
+++ b/metal/paddle-mobile/paddle-mobile/MobilenetSSD_AR.swift
@@ -90,32 +90,32 @@ public class MobileNet_ssd_AR: Net{
      let v = program.scope[output]!
      let originTexture = v as! Texture<Float32>
      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1] / 7, originTexture.tensorDim[0] * 7])
-      
+
      originTexture.dim = Dim.init(inDim: [1, 1, originTexture.dim[3] / 7, originTexture.dim[2] * 7])
-      
+
      originTexture.padToFourDim = Dim.init(inDim: [1, 1, originTexture.padToFourDim[3] / 7, originTexture.padToFourDim[2] * 7])
-      
+
      program.scope[output] = originTexture
-      
+
      if i == 99 {
        opDesc.attrs["axis"] = 0
      } else {
        opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
      }
    }
-    
+
    for i in [58, 59, 88, 89, 95, 96, 68, 69, 78, 79] {
      let opDesc = program.programDesc.blocks[0].ops[i]
      let output = opDesc.outputs["Out"]!.first!
      let v = program.scope[output]!
-      
-      
+
+

      let originTexture = v as! Texture<Float32>
      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
      opDesc.attrs["shape"] = originTexture.tensorDim.dims.map { Int32($0) }
    }
-    
+
    for i in [60, 101, 90, 97, 70, 80] {
      let opDesc = program.programDesc.blocks[0].ops[i]
      let output = opDesc.outputs["Out"]!.first!
@@ -124,7 +124,7 @@ public class MobileNet_ssd_AR: Net{
      originTexture.tensorDim = Dim.init(inDim: [originTexture.tensorDim[1], originTexture.tensorDim[2]])
      opDesc.attrs["axis"] = (opDesc.attrs["axis"]! as! Int) - 1
    }
-    
+
    for i in [102] {
      let opDesc = program.programDesc.blocks[0].ops[i]
      for output in opDesc.outputs["Out"]! {

--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/OpCreator.swift
@@ -65,7 +65,9 @@ class OpCreator<P: PrecisionType> {
         gSplit                     :     SplitOp<P>.creat,
         gShape                     :     ShapeOp<P>.creat,
         gFlatten                   :     FlattenOp<P>.creat,
-         gConvAddPreluType          :     ConvAddPreluOp<P>.creat]
+         gConvAddPreluType          :     ConvAddPreluOp<P>.creat,
+         gConvAddAddPreluType       :     ConvAddAddPreluOp<P>.creat,
+         gElementwiseAddPreluType:   ElementwiseAddPreluOp<P>.creat]
  
    private init(){}
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Base/Operator.swift
@@ -157,6 +157,9 @@ let gSplit                      = "split"
 let gShape                      = "shape"
 let gFlatten                    = "flatten"
 let gConvAddPreluType           = "conv_add_prelu"
+let gConvAddAddPreluType        = "conv_add_add_prelu"
+let gElementwiseAddPreluType = "elementwise_add_prelu"
+

 let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Output"]),
               gBatchNormType               : (inputs: ["X"], outputs: ["Y"]),
@@ -183,6 +186,7 @@ let opInfos = [gConvType                    : (inputs: ["Input"], outputs: ["Out
               gSplit                       : (inputs: ["X"], outputs: ["Out"]),
               gShape                       : (inputs: ["Input"], outputs: ["Out"]),
               gFlatten                     : (inputs: ["X"], outputs: ["Out"]),
-               gConvAddPreluType            : (inputs: ["Input"], outputs: ["Out"])
-
+               gConvAddPreluType            : (inputs: ["Input"], outputs: ["Out"]),
+               gConvAddAddPreluType         : (inputs: ["Input"], outputs: ["Out"]),
+               gElementwiseAddPreluType  :  (inputs: ["X"], outputs: ["Out"])
              ]
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddAddPreluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Parameter bundle consumed by `ConvAddAddPreluKernel`.
+///
+/// Each stored property is resolved from the op description (`opDesc`) and the
+/// scope (`inScope`) through the static lookup helpers called in `init`.
+class ConvAddAddPreluParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+
+  // Textures read from `opDesc.inputs` / `opDesc.outputs`.
+  let input: Texture<P>
+  var output: Texture<P>
+
+  // Tensors read from `opDesc.paraInputs`.
+  let filter: Tensor<ParamPrecisionType>
+  let y: Tensor<ParamPrecisionType>
+  let alpha: Tensor<P>
+
+  // Values read from `opDesc.attrs`.
+  let stride: [Int32]
+  let paddings: [Int32]
+  let dilations: [Int32]
+  let groups: Int
+  let mode: String
+
+  /// Resolves every field from `opDesc` and `inScope`.
+  ///
+  /// - Parameters:
+  ///   - opDesc: Op description supplying inputs, parameter inputs, outputs and attributes.
+  ///   - inScope: Scope the lookup helpers resolve variables from.
+  /// - Throws: The error of the first lookup that fails; lookups run in the
+  ///   order written below and the error is propagated unchanged.
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    filter = try ConvAddAddPreluParam.inputFilter(paraInputs: opDesc.paraInputs, from: inScope)
+    input = try ConvAddAddPreluParam.input(inputs: opDesc.inputs, from: inScope)
+    output = try ConvAddAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+    stride = try ConvAddAddPreluParam.getAttr(key: "strides", attrs: opDesc.attrs)
+    paddings = try ConvAddAddPreluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
+    dilations = try ConvAddAddPreluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
+    groups = try ConvAddAddPreluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+    alpha = try ConvAddAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+    mode = try ConvAddAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+    y = try ConvAddAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+  }
+}
+
+/// Fused operator backed by `ConvAddAddPreluKernel`.
+///
+/// Its fusion pattern is the op chain
+/// `gConvType --> gElementwiseAddType --> gElementwiseAddType --> gPreluType`.
+class ConvAddAddPreluOp<P: PrecisionType>: Operator<ConvAddAddPreluKernel<P>, ConvAddAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ConvAddAddPreluOp<P>
+
+  /// Builds the node chain this fusion matches and returns its first node.
+  static func fusionNode() -> Node {
+    let head = Node(inType: gConvType)
+    _ = head
+      --> Node(inType: gElementwiseAddType)
+      --> Node(inType: gElementwiseAddType)
+      --> Node(inType: gPreluType)
+    return head
+  }
+
+  /// No input/output key renames are requested for this fusion.
+  static func change() -> [String : [(from: String, to: String)]] {
+    return [:]
+  }
+
+  /// Type string under which the fused op is registered.
+  static func fusionType() -> String {
+    return gConvAddAddPreluType
+  }
+
+  /// NOTE(review): presumably (node position in the chain, input key) pairs the
+  /// fusion pass has to verify before fusing — confirm against the `Fusion` protocol.
+  static func needCheck() -> [(Int, String)] {
+    return [(2, "Y"), (2, "X")]
+  }
+
+  /// Derives the output dims from input dims, filter dims and conv attributes.
+  ///
+  /// The first entry is copied from input dim 0, one entry is produced per
+  /// stride using `(in + 2 * pad - (dilation * (k - 1) + 1)) / stride + 1`
+  /// (integer division), and the last entry is filter dim 0.
+  func inferShape() {
+    let inputDim = para.input.dim
+    let filterDim = para.filter.dim
+
+    var outputDims = [inputDim[0]]
+    for (axis, rawStride) in para.stride.enumerated() {
+      // Input and filter dims are offset by one relative to the attribute arrays.
+      let dimIndex = axis + 1
+      let effectiveKernel: Int = Int(para.dilations[axis]) * (filterDim[dimIndex] - 1) + 1
+      let paddedInput: Int = inputDim[dimIndex] + 2 * Int(para.paddings[axis])
+      outputDims.append((paddedInput - effectiveKernel) / Int(rawStride) + 1)
+    }
+    outputDims.append(filterDim[0])
+
+    para.output.dim = Dim(inDim: outputDims)
+  }
+
+  /// Encodes the kernel into `buffer`; any error from the kernel is propagated unchanged.
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    try kernel.compute(commandBuffer: buffer, param: para)
+  }
+
+  /// Debug helper: prints the op type followed by a strided sample of the output.
+  func delogOutput() {
+    print(" \(type) output: ")
+    let shape = para.output.tensorDim
+    let sample = para.output.metalTexture.toTensor(dim: (n: shape[0], c: shape[1], h: shape[2], w: shape[3])).strideArray()
+    print(sample)
+  }
+
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddPreluOp.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+/// Parameter bundle consumed by `ElementwiseAddPreluKernel`.
+class ElementwiseAddPreluParam<P: PrecisionType>: OpParam {
+  typealias ParamPrecisionType = P
+
+  let mode: String
+  let alpha: Tensor<P>
+  var inputX: Texture<P>
+  var inputY: Texture<P>
+  var output: Texture<P>
+  var axis: Int
+
+  /// Resolves every field from `opDesc` and `inScope`.
+  ///
+  /// `inputY` is first looked up as a texture. If that lookup throws, it is
+  /// looked up again as a tensor and copied into a freshly created texture on
+  /// the device that owns `inputX`'s Metal texture.
+  ///
+  /// - Throws: The error of the first failing lookup, or the error of the
+  ///   tensor lookup when the texture lookup for `inputY` has already failed.
+  required init(opDesc: OpDesc, inScope: Scope) throws {
+    alpha = try ElementwiseAddPreluParam.paramInputAlpha(inputs: opDesc.paraInputs, from: inScope)
+    mode = try ElementwiseAddPreluParam.getAttr(key: "mode", attrs: opDesc.attrs)
+    inputX = try ElementwiseAddPreluParam.inputX(inputs: opDesc.inputs, from: inScope)
+    output = try ElementwiseAddPreluParam.outputOut(outputs: opDesc.outputs, from: inScope)
+    axis = try ElementwiseAddPreluParam.getAttr(key: "axis", attrs: opDesc.attrs)
+
+    do {
+      inputY = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+    } catch {
+      // Texture lookup failed: retry the same key as a tensor and upload it.
+      let yTensor: Tensor<P> = try ElementwiseAddPreluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
+      let mtlDevice = inputX.metalTexture!.device
+      inputY = Texture.init(device: mtlDevice, inDim: yTensor.dim)
+      let elements: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel()))
+      inputY.metalTexture = mtlDevice.tensor2texture(value: elements, dim: yTensor.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: computePrecision)
+    }
+
+    // An axis of -1 aligns Y with the trailing dims of X.
+    let firstSharedAxis = axis == -1 ? inputX.tensorDim.cout() - inputY.tensorDim.cout() : axis
+    // Debug-only check that every dim of Y equals the X dim it is aligned with.
+    for yIndex in 0..<inputY.tensorDim.cout() {
+      assert(inputX.tensorDim[firstSharedAxis + yIndex] == inputY.tensorDim[yIndex])
+    }
+  }
+}
+
+/// Fused operator backed by `ElementwiseAddPreluKernel`.
+///
+/// Its fusion pattern is the op chain `gElementwiseAddType --> gPreluType`.
+class ElementwiseAddPreluOp<P: PrecisionType>: Operator<ElementwiseAddPreluKernel<P>, ElementwiseAddPreluParam<P>>, Runable, Creator, InferShaperable, Fusion{
+  typealias OpType = ElementwiseAddPreluOp<P>
+
+  /// Builds the node chain this fusion matches and returns its first node.
+  static func fusionNode() -> Node {
+    let head = Node(inType: gElementwiseAddType)
+    _ = head
+      --> Node(inType: gPreluType)
+    return head
+  }
+
+  /// No input/output key renames are requested for this fusion.
+  static func change() -> [String : [(from: String, to: String)]] {
+    return [:]
+  }
+
+  /// Type string under which the fused op is registered.
+  static func fusionType() -> String {
+    return gElementwiseAddPreluType
+  }
+
+  /// Intentionally empty: this op does not assign `para.output.dim` here.
+  func inferShape() {
+    //    para.output.dim = para.input.dim
+  }
+
+  /// Encodes the kernel into `buffer`; any error from the kernel is propagated unchanged.
+  func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws {
+    try kernel.compute(commandBuffer: buffer, param: para)
+  }
+
+  /// Debug helper: prints the op type, the output texture, and a strided
+  /// sample of its contents for the two transposes handled below.
+  func delogOutput() {
+    print(" \(type) output: ")
+    print(para.output)
+
+    let layout = para.output.transpose
+
+    if layout == [0, 2, 3, 1] {
+      let shape = para.output.tensorDim
+      print(para.output.metalTexture.toTensor(dim: (n: shape[0], c: shape[1], h: shape[2], w: shape[3])).strideArray())
+      return
+    }
+
+    guard layout == [0, 1, 2, 3] else {
+      print(" not implement")
+      return
+    }
+
+    let padded = para.output.padToFourDim
+    let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: padded[0], h: padded[1], w: padded[2], c: padded[3]))
+    print(outputArray.strideArray())
+  }
+}
+
+
+
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddAddPreluKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+class ConvAddAddPreluKernel<P: PrecisionType>: Kernel, Computable {
+  var metalParam: MetalConvParam!
+  /// Prepares the output texture and the filter / `y` / `alpha` buffers, selects the
+  /// Metal function matching the filter shape, PReLU mode and compute precision, and
+  /// stores the convolution parameters that `compute` later copies to the shader.
+  ///
+  /// The function name is composed as `<shape>_prelu_<mode>_<precision>`, for example
+  /// `conv_add_3x3_prelu_channel_half`.
+  ///
+  /// - Parameters:
+  ///   - device: Metal device used for the texture, the buffers and the pipeline.
+  ///   - param: Operator parameters; `filter`, `y`, `alpha`, `output`, `mode`,
+  ///     `stride`, `paddings` and `dilations` are read here.
+  ///
+  /// Traps with `fatalError()` for an unknown precision and with
+  /// `fatalError(" unsupport yet ")` for an unsupported filter shape.
+  required init(device: MTLDevice, param: ConvAddAddPreluParam<P>) {
+    param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision)
+    param.filter.initBuffer(device: device, precision: computePrecision)
+    param.y.initBuffer(device: device, precision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: computePrecision)
+
+    // Precision is validated first, so an unknown precision traps before the
+    // filter shape is looked at.
+    let precisionSuffix: String
+    if computePrecision == .Float16 {
+      precisionSuffix = "half"
+    } else if computePrecision == .Float32 {
+      precisionSuffix = "float"
+    } else {
+      fatalError()
+    }
+
+    // Branch order matters: 1x1 wins over everything, and any remaining
+    // single-channel filter is routed to the depthwise 3x3 function before the
+    // explicit width/height checks.
+    let filterWidth = param.filter.width
+    let filterHeight = param.filter.height
+    let shapePrefix: String
+    if filterWidth == 1 && filterHeight == 1 {
+      shapePrefix = "conv_add_1x1"
+    } else if param.filter.channel == 1 {
+      shapePrefix = "depthwise_conv_add_3x3"
+    } else if filterWidth == 3 && filterHeight == 3 {
+      shapePrefix = "conv_add_3x3"
+    } else if filterWidth == 1 && filterHeight == 5 {
+      shapePrefix = "conv_add_5x1"
+    } else if filterWidth == 5 && filterHeight == 1 {
+      shapePrefix = "conv_add_1x5"
+    } else {
+      fatalError(" unsupport yet ")
+    }
+
+    // Every mode other than "channel" and "element" uses the "other" variant.
+    let modeSuffix: String
+    if param.mode == "channel" {
+      modeSuffix = "channel"
+    } else if param.mode == "element" {
+      modeSuffix = "element"
+    } else {
+      modeSuffix = "other"
+    }
+
+    super.init(device: device, inFunctionName: "\(shapePrefix)_prelu_\(modeSuffix)_\(precisionSuffix)")
+
+    // Per axis: half of the dilated filter extent minus the padding.
+    let offsetY = (Int(param.dilations[1]) * (filterHeight - 1) + 1) / 2 - Int(param.paddings[1])
+    let offsetX = (Int(param.dilations[0]) * (filterWidth - 1) + 1) / 2 - Int(param.paddings[0])
+
+    metalParam = MetalConvParam(offsetX: Int16(offsetX),
+                                offsetY: Int16(offsetY),
+                                offsetZ: 0,
+                                strideX: UInt16(param.stride[0]),
+                                strideY: UInt16(param.stride[1]),
+                                dilationX: UInt16(param.dilations[0]),
+                                dilationY: UInt16(param.dilations[1]))
+  }
+  
+  /// Encodes one dispatch of the selected convolution function into `commandBuffer`.
+  ///
+  /// Texture slots: 0 = input, 1 = output.
+  /// Buffer slots: 0 = `metalParam` bytes, 1 = filter, 2 = `y`, 3 = `alpha`.
+  ///
+  /// - Parameters:
+  ///   - commandBuffer: Command buffer the compute encoder is created from.
+  ///   - param: Operator parameters supplying the textures and buffers.
+  /// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
+  func compute(commandBuffer: MTLCommandBuffer, param: ConvAddAddPreluParam<P>) throws {
+    guard let computeEncoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    // Encoding is closed when the method returns.
+    defer { computeEncoder.endEncoding() }
+
+    // Buffer arguments.
+    computeEncoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+    computeEncoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+    computeEncoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+    computeEncoder.setBuffer(param.alpha.buffer, offset: 0, index: 3)
+
+    // Texture arguments.
+    computeEncoder.setTexture(param.input.metalTexture, index: 0)
+    computeEncoder.setTexture(param.output.metalTexture, index: 1)
+
+    computeEncoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddPreluKernel.swift
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+import Foundation
+
+
+/// Kernel wrapper for the fused element-wise add + PReLU Metal functions.
+///
+/// The initializer prepares the output texture and the alpha buffer, fills the
+/// parameter block read by the shader, and selects the shader variant for the
+/// PReLU mode and compute precision. `compute` encodes one dispatch.
+class ElementwiseAddPreluKernel<P: PrecisionType>: Kernel, Computable {
+  /// Parameter block copied to buffer slot 0 on every `compute` call.
+  var metalParam: ElementwiseAddMetalParam
+
+  /// - Parameters:
+  ///   - device: Metal device used for the texture, the buffer and the pipeline.
+  ///   - param: Operator parameters; `inputX`, `inputY`, `output`, `alpha`, `axis`
+  ///     and `mode` are read here.
+  ///
+  /// Traps with `fatalError()` when the compute precision is neither Float32 nor Float16.
+  required init(device: MTLDevice, param: ElementwiseAddPreluParam<P>) {
+    param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision)
+    param.alpha.initBuffer(device: device, precision: computePrecision)
+
+    metalParam = ElementwiseAddMetalParam.init()
+
+    let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
+    let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
+    let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
+    let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
+
+    metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
+    metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
+    metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
+    metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
+
+    // NOTE(review): `tensorDim.cout()` is presumably the number of dimensions — confirm.
+    let yLength = Int32(param.inputY.tensorDim.cout())
+    if param.axis == -1 {
+      // Default axis: chosen so that axis + ylen == 4.
+      metalParam.axis = 4 - yLength
+    } else {
+      metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
+    }
+    metalParam.ylen = yLength
+
+    // Identical dims and transpose on both inputs: the shader can read Y at the
+    // same texel as X instead of remapping coordinates.
+    if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
+      metalParam.fast = 1
+    }
+
+    // Precision is validated first, so an unknown precision traps whatever the mode is.
+    let precisionSuffix: String
+    if computePrecision == .Float32 {
+      precisionSuffix = "float"
+    } else if computePrecision == .Float16 {
+      precisionSuffix = "half"
+    } else {
+      fatalError()
+    }
+
+    // The names must match the kernels instantiated in ElementwiseAddPreluKernel.metal
+    // through FUNC3_(elementwise_add, PRELU_TYPE, P), where PRELU_TYPE is `channel`,
+    // `prelu_element` and `prelu_other`.
+    // NOTE(review): assumes FUNC3_ joins its arguments with underscores — confirm
+    // against Macro.metal.
+    let kernelPrefix: String
+    if param.mode == "channel" {
+      kernelPrefix = "elementwise_add_channel"
+    } else if param.mode == "element" {
+      kernelPrefix = "elementwise_add_prelu_element"
+    } else {
+      kernelPrefix = "elementwise_add_prelu_other"
+    }
+
+    super.init(device: device, inFunctionName: "\(kernelPrefix)_\(precisionSuffix)")
+  }
+
+  /// Encodes one dispatch of the selected function into `commandBuffer`.
+  ///
+  /// Texture slots: 0 = `inputX`, 1 = `inputY`, 2 = output.
+  /// Buffer slots: 0 = `metalParam` bytes, 1 = `alpha`.
+  ///
+  /// - Throws: `PaddleMobileError.predictError` when no compute encoder can be created.
+  func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddPreluParam<P>) throws {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+      throw PaddleMobileError.predictError(message: " encode is nil")
+    }
+    encoder.setTexture(param.inputX.metalTexture, index: 0)
+    encoder.setTexture(param.inputY.metalTexture, index: 1)
+    encoder.setTexture(param.output.metalTexture, index: 2)
+    encoder.setBytes(&metalParam, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
+    encoder.setBuffer(param.alpha.buffer, offset: 0, index: 1)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    encoder.endEncoding()
+  }
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
@@ -40,7 +40,7 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  float4 output = biase[gid.z];
  
  float4 input;
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -57,7 +57,7 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
    output.w += dot(input, weight_w);
  }
-  output = output + biase[gid.z];
+//  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }

@@ -85,7 +85,7 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
  
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  float4 output = biase[gid.z];
  
  ushort dilation_x = param.dilationX;
  ushort dilation_y = param.dilationY;
@@ -125,7 +125,7 @@ kernel void conv_add_3x3(texture2d_array<float, access::sample> inTexture [[text
      output.w += dot(input[j], weight_w);
    }
  }
-  output = output + biase[gid.z];
+//  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }

@@ -153,7 +153,7 @@ kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[text
  
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  float4 output = biase[gid.z];
  
  ushort dilation_y = param.dilationY;
  float4 input[5];
@@ -183,7 +183,7 @@ kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[text
      output.w += dot(input[j], weight_w);
    }
  }
-  output = output + biase[gid.z];
+//  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }

@@ -212,7 +212,7 @@ kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[text
  
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  float4 output = biase[gid.z];
  
  ushort dilation_x = param.dilationX;
  float4 input[5];
@@ -242,7 +242,7 @@ kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[text
      output.w += dot(input[j], weight_w);
    }
  }
-  output = output + biase[gid.z];
+//  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }

@@ -265,7 +265,7 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
  const uint kernelHXW = 9;
  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
+  float4 output = biase[gid.z];
  float4 inputs[9];
  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
@@ -283,7 +283,7 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
  }
-  output = output + biase[gid.z];
+//  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }

@@ -312,7 +312,7 @@ kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  half4 output = biase[gid.z];
  
  half4 input;
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -329,8 +329,8 @@ kernel void conv_add_1x1_half(texture2d_array<half, access::sample> inTexture [[
    half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
    output.w += dot(input, weight_w);
  }
-  output = output + float4(biase[gid.z]);
-  outTexture.write(half4(output), gid.xy, gid.z);
+//  output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
 }

 kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
@@ -354,7 +354,7 @@ kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  half4 output = biase[gid.z];
  
  ushort dilation_x = param.dilationX;
  ushort dilation_y = param.dilationY;
@@ -384,8 +384,8 @@ kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[
      output.w += dot(float4(input[j]), float4(weight_w));
    }
  }
-  output = output + float4(biase[gid.z]);
-  outTexture.write(half4(output), gid.xy, gid.z);
+//  output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
 }

 kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
@@ -406,7 +406,7 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
  const uint kernelHXW = 9;
  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(0.0);
+  half4 output = biase[gid.z];
  half4 inputs[9];
  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
@@ -419,13 +419,13 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
  for (int j = 0; j < 9; ++j) {
    half4 input = inputs[j];
-    output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
-    output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
-    output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
  }
-  output = output + float4(biase[gid.z]);
-  outTexture.write(half4(output), gid.xy, gid.z);
+//  output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
 }


@@ -453,7 +453,7 @@ kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[
  
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  half4 output = biase[gid.z];
  
  ushort dilation_y = param.dilationY;
  half4 input[5];
@@ -480,11 +480,11 @@ kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[
      output.z += dot(input[j], weight_z);
      
      half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-      output.w += dot(float4(input[j]), float4(weight_w));
+      output.w += dot(input[j], weight_w);
    }
  }
-  output = output + float4(biase[gid.z]);
-  outTexture.write(half4(output), gid.xy, gid.z);
+//  output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
 }


@@ -512,7 +512,7 @@ kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[
  
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  half4 output = biase[gid.z];
  
  ushort dilation_x = param.dilationX;
  half4 input[5];
@@ -542,8 +542,8 @@ kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[
      output.w += dot(input[j], weight_w);
    }
  }
-  output = output + float4(biase[gid.z]);
-  outTexture.write(half4(output), gid.xy, gid.z);
+//  output = output + float4(biase[gid.z]);
+  outTexture.write(output, gid.xy, gid.z);
 }



--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddPrelu.inc.metal
@@ -49,7 +49,7 @@ kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sampl
  uint input_arr_size = inTexture.get_array_size();
  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
  
-  float4 output = float4(0.0);
+  VECTOR(P, 4) output = biase[gid.z];
  
  VECTOR(P, 4) input;
  for (uint i = 0; i < input_arr_size; ++i) {
@@ -67,7 +67,7 @@ kernel void FUNC3_(conv_add_1x1, PRELU_TYPE, P)(texture2d_array<P, access::sampl
    output.w += dot(input, weight_w);
  }
  
-  output = output + float4(biase[gid.z]);
+//  output = output + float4(biase[gid.z]);
  
 #ifdef PRELU_CHANNEL
  VECTOR(P, 4) alpha_value = alpha[gid.z];
@@ -126,7 +126,7 @@ kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sampl

  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;

-  float4 output = float4(0.0);
+  VECTOR(P, 4) output = biase[gid.z];

  ushort dilation_x = param.dilationX;
  ushort dilation_y = param.dilationY;
@@ -166,7 +166,7 @@ kernel void FUNC3_(conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, access::sampl
      output.w += dot(input[j], weight_w);
    }
  }
-  output = output + float4(biase[gid.z]);
+//  output = output + float4(biase[gid.z]);
  
 #ifdef PRELU_CHANNEL
  VECTOR(P, 4) alpha_value = alpha[gid.z];
@@ -226,7 +226,7 @@ kernel void FUNC3_(conv_add_5x1, PRELU_TYPE, P)(texture2d_array<P, access::sampl

  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;

-  float4 output = float4(biase[gid.z]);;
+  VECTOR(P, 4) output = biase[gid.z];;

  ushort dilation_y = param.dilationY;
  VECTOR(P, 4) input[5];
@@ -316,7 +316,7 @@ kernel void FUNC3_(conv_add_1x5, PRELU_TYPE, P)(texture2d_array<P, access::sampl

  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;

-  float4 output = float4(biase[gid.z]);
+  VECTOR(P, 4) output = biase[gid.z];

  ushort dilation_x = param.dilationX;
  VECTOR(P, 4) input[5];
@@ -399,7 +399,7 @@ kernel void FUNC3_(depthwise_conv_add_3x3, PRELU_TYPE, P)(texture2d_array<P, acc
  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
  const uint kernelHXW = 9;
  uint weithTo = gid.z * kernelHXW * 4;
-  float4 output = float4(biase[gid.z]);
+  VECTOR(P, 4) output = biase[gid.z];
  VECTOR(P, 4) inputs[9];
  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.inc.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#ifdef P
+
+#include <metal_stdlib>
+#include "Macro.metal"
+
+using namespace metal;
+
+// Fused element-wise add + PReLU: writes prelu(inputX + inputY) to outTexture.
+// One kernel is produced per (PRELU_TYPE, P) pair by the including file.
+//   PRELU_CHANNEL: one alpha vector per output array slice (alpha[gid.z]).
+//   PRELU_ELEMENT: one alpha vector per output texel and slice.
+//   PRELU_OTHER:   a single scalar alpha (alpha[0]) shared by all values.
+kernel void FUNC3_(elementwise_add, PRELU_TYPE, P)(texture2d_array<P, access::read> inputX [[texture(0)]],
+                                 texture2d_array<P, access::read> inputY [[texture(1)]],
+                                 texture2d_array<P, access::write> outTexture [[texture(2)]],
+                                 constant ElementwiseAddParam &pm [[buffer(0)]],
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT)
+                                 const device VECTOR(P, 4) *alpha [[buffer(1)]],
+#endif
+#ifdef PRELU_OTHER
+                                 const device P *alpha [[buffer(1)]],
+#endif
+                                 uint3 gid [[thread_position_in_grid]]) {
+  // Threads outside the output texture have nothing to write.
+  const bool outside = gid.x >= outTexture.get_width() ||
+                       gid.y >= outTexture.get_height() ||
+                       gid.z >= outTexture.get_array_size();
+  if (outside) {
+    return;
+  }
+
+  // X is read at the output coordinate on both paths.
+  const VECTOR(P, 4) x_value = inputX.read(gid.xy, gid.z);
+  VECTOR(P, 4) y_value;
+
+  if (pm.fast != 1) {
+    // Remapping path: derive, lane by lane, where the matching Y value lives.
+    // NOTE(review): xyzn2abcd / invtrans / trans / abcd2xyzn are defined outside
+    // this file; presumably they convert between texture coordinates and tensor
+    // indices — confirm in Common.metal.
+    int32_t x_texel[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0};
+    int32_t x_index[4];
+    int32_t scratch[4];
+    int32_t y_index[4] = {0, 0, 0, 0};
+    int32_t y_texel[4];
+    int32_t x_perm[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+    int32_t y_perm[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    const int32_t y_shift = 4 - pm.ylen - pm.axis;
+    const int32_t axis_end = pm.axis + pm.ylen;
+    for (int lane = 0; lane < 4; ++lane) {
+      x_texel[3] = lane;
+      xyzn2abcd(pm.xdim[3], x_texel, x_index);
+      invtrans(x_perm, x_index, scratch);
+      // Copy the ylen indices starting at axis into the tail of the Y index.
+      for (int k = pm.axis; k < axis_end; ++k) {
+        y_index[y_shift + k] = scratch[k];
+      }
+      trans(y_perm, y_index, scratch);
+      abcd2xyzn(pm.ydim[3], scratch, y_texel);
+      y_value[lane] = inputY.read(uint2(y_texel[0], y_texel[1]), y_texel[2])[y_texel[3]];
+    }
+  } else {
+    // Fast path: Y is read at the same texel as X.
+    y_value = inputY.read(gid.xy, gid.z);
+  }
+
+  VECTOR(P, 4) output = x_value + y_value;
+
+#if defined(PRELU_CHANNEL) || defined(PRELU_ELEMENT) || defined(PRELU_OTHER)
+#if defined(PRELU_CHANNEL)
+  const VECTOR(P, 4) slope = alpha[gid.z];
+#elif defined(PRELU_ELEMENT)
+  const int texel_base = (gid.y * outTexture.get_width() + gid.x) * outTexture.get_array_size();
+  const VECTOR(P, 4) slope = alpha[texel_base + gid.z];
+#else
+  // Scalar alpha replicated to all four lanes.
+  const VECTOR(P, 4) slope = VECTOR(P, 4)(alpha[0]);
+#endif
+  // PReLU per lane: values greater than zero pass through, the rest are scaled.
+  for (int lane = 0; lane < 4; ++lane) {
+    output[lane] = output[lane] > 0 ? output[lane] : (slope[lane] * output[lane]);
+  }
+#endif
+
+  outTexture.write(output, gid.xy, gid.z);
+}
+
+#endif
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ElementwiseAddPreluKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+#include "Common.metal"
+using namespace metal;
+
+struct ElementwiseAddParam {  // parameter block the elementwise_add PReLU kernels receive at buffer(0)
+  int32_t fast;  // 1: the kernel reads Y at the same texel as X and skips coordinate remapping
+  int32_t axis;  // first index copied from the X-side index into the Y-side index when remapping
+  int32_t ylen;  // how many consecutive indices, starting at axis, are copied
+  int32_t xdim[4];  // the kernel passes xdim[3] to xyzn2abcd
+  int32_t xtrans[4];  // the kernel passes these four values to invtrans
+  int32_t ydim[4];  // the kernel passes ydim[3] to abcd2xyzn
+  int32_t ytrans[4];  // the kernel passes these four values to trans
+};
+
+#define P float
+// Each include below stamps out one kernel named from (elementwise_add, PRELU_TYPE, P); the PRELU_CHANNEL / PRELU_ELEMENT / PRELU_OTHER define selects the alpha parameter type and the PReLU code path. NOTE(review): PRELU_TYPE is `channel` for this variant but `prelu_element` / `prelu_other` for the others, so the channel kernel name has no `prelu_` part; the Swift kernel looks up "elementwise_add_channel_float", so rename both sides together or not at all.
+#define PRELU_CHANNEL prelu_channel
+#define PRELU_TYPE channel
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
+// float, per-element alpha variant
+#define PRELU_ELEMENT element
+#define PRELU_TYPE prelu_element
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
+// float, single scalar alpha variant
+#define PRELU_OTHER   other
+#define PRELU_TYPE prelu_other
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
+
+#undef P
+// The same three variants again, instantiated for half.
+#define P half
+// half, per-slice alpha variant (same `channel` naming as the float one above)
+#define PRELU_CHANNEL channel
+#define PRELU_TYPE channel
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_CHANNEL
+// half, per-element alpha variant
+#define PRELU_ELEMENT element
+#define PRELU_TYPE prelu_element
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_ELEMENT
+// half, single scalar alpha variant
+#define PRELU_OTHER   other
+#define PRELU_TYPE prelu_other
+#include "ElementwiseAddPreluKernel.inc.metal"
+#undef  PRELU_TYPE
+#undef  PRELU_OTHER
+
+#undef P
+
+
+
+
--- a/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
+++ b/metal/paddle-mobile/paddle-mobile/Program/ProgramOptimize.swift
@@ -69,6 +69,7 @@ class Node {
  
  func to(depth: UInt) -> Node {
    let beginNode = Node.init(inType: type)
+    beginNode.opDesc = opDesc
    to(depth: depth - 1, withNode: beginNode)
    return beginNode
  }
@@ -130,6 +131,7 @@ class Node {
    
    for output in outputs {
      let node = Node.init(inType: output.type)
+      node.opDesc = output.opDesc
      withNode.outputs.append(node)
      output.to(depth: depth - 1, withNode: node)
    }
@@ -182,10 +184,12 @@ extension Node: Equatable {
 class ProgramOptimize<P: PrecisionType> {
  // register fusion
  let fusionOps: [Fusion.Type] = [ConvAddBatchNormReluOp<P>.self,
+//                                  ConvAddAddPreluOp<P>.self,
                                  ConvAddPreluOp<P>.self,
                                  ConvAddOp<P>.self,
                                  ConvBNReluOp<P>.self,
-                                  DwConvBNReluOp<P>.self
+                                  DwConvBNReluOp<P>.self,
+//                                  ElementwiseAddPreluOp<P>.self
  ]
  
  func optimize(originProgramDesc: ProgramDesc) -> ProgramDesc {
@@ -256,6 +260,15 @@ class ProgramOptimize<P: PrecisionType> {
                    }
                  }
                }
+                
+                let paramInputToChecks = checkNode.opDesc?.paraInputs[toCheck.1] ?? []
+                for paramInputToCheck in paramInputToChecks {
+                  if node.output[paramInputToCheck] == nil {
+                    if relationshipMap[paramInputToCheck] == nil {
+                      canFolder = false
+                    }
+                  }
+                }
              }
            }
            

--- a/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Executor.swift
@@ -83,7 +83,7 @@ public class Executor<P: PrecisionType> {
    for block in inProgram.programDesc.blocks {
      //block.ops.count
      for i in 0..<block.ops.count {
-        let op = block.ops[i]
+        let opDesc = block.ops[i]
        do {
 //          print("in for i \(i): ")
 //      print(program.scope.vars["fea_pyramid1_mbox_conf_flat.Flatten.output.1.tmp_0"])
@@ -93,7 +93,7 @@ public class Executor<P: PrecisionType> {
 //
 //          }
          
-          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
+          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: opDesc, scope: inProgram.scope)
          ops.append(op)
        } catch let error {
          throw error

--- a/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
+++ b/metal/paddle-mobile/paddle-mobile/paddle_mobile.h
@@ -14,7 +14,7 @@

 #pragma once

-#import "PaddleMobile.h"
+#import "PaddleMobileCPU.h"
 #import "CPUCompute.h"
 #import "PaddleMobileGPU.h"
 #import <UIKit/UIKit.h>