diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift
index c3520723ff02d13d364fe71328f3a6853e0dad33..ec21b5148734f25017ca5580a3d5be09b0a450f7 100644
--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -55,6 +55,7 @@ public class Executor<P: PrecisionType> {
         device = inDevice
         queue = inQueue
         for block in inProgram.programDesc.blocks {
+            //block.ops.count
             for i in 0..<block.ops.count {
                 let op = block.ops[i]
                 do {
@@ -65,6 +66,7 @@ public class Executor<P: PrecisionType> {
                     throw error
                 }
             }
+            
 //            for op in block.ops {
 //                do {
 //                    let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
@@ -94,16 +96,14 @@ public class Executor<P: PrecisionType> {
         }
         
         buffer.addCompletedHandler { (commandbuffer) in
-            
             for op in self.ops {
                 op.delogOutput()
             }
             
+
             let afterDate = Date.init()
             print(" encoder end ! time: \(afterDate.timeIntervalSince(beforeDate))")
-            
         }
-        
         buffer.commit()
         
         guard let outputVar = program.scope.output() else {
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
index b4d471c68efc2ec79dfbeeb967d60c8fb38e3430..5c78b03f9fc5dafadc60d25975d952271ba3848a 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift
@@ -107,17 +107,16 @@ class ConvAddBatchNormReluOp<P: PrecisionType>: Operator<ConvAddBatchNormReluKer
     }
     
     func delogOutput() {
-//        let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: false)
+//        let _: P? = para.input.metalTexture.logDesc(header: "conv add batchnorm relu input: ", stridable: true)
 //        para.filter.logDataPointer(header: "filter data pointer: ")
-//        
+        // Debug hook: every statement in this body is commented out, so calling it currently does nothing.
 //        print("filter: \(para.filter)")
 //        print("biase: \(para.bias)")
 //        print("padding: \(para.paddings)")
 //        print("stride: \(para.stride)")
-//        
+        // Uncomment individual lines in this body to re-enable that piece of logging.
 //        let _: P? = para.newBiase?.logDesc(header: "new biase: ", stridable: false)
 //        let _: P? = para.newScale?.logDesc(header: "new scale: ", stridable: false)
-//        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: true)
+//        let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false)
     }
-    
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index dc70c2fd43ba466fe77f25051252be00e7072485..cac707dfa9fe9d04341a1d7de348a2e95def56f1 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -22,7 +22,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
         if param.filter.width == 1 && param.filter.height == 1 {
             super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
         } else if param.filter.channel == 1 {
-            super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_1x1")
+            super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
         } else {
             super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
         }
@@ -75,7 +75,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionType>: Kernel, Computable {
         encoder.setTexture(param.output.metalTexture, index: 1)
         encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
         encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
-        encoder.setBuffer(param.bias.buffer, offset: 0, index: 2)
+        encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
         encoder.setBuffer(param.newScale!, offset: 0, index: 3)
         encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
         encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
index 950abd47f3f98c3f1404c25bd0a572043086df5e..57726c5fb07d4644cce9c37ba6e6eafa81e661b4 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -15,11 +15,31 @@
 import Foundation
 
 class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
+    /// Convolution constants passed to the shader at buffer index 0; assigned once in `init`.
+    var metalParam: MetalConvParam!
     required init(device: MTLDevice, param: ConvAddParam<P>) {
         super.init(device: device, inFunctionName: "conv_add_1x1")
+        // Offset = half the filter extent minus padding. NOTE(review): the shader is always `conv_add_1x1`; confirm only 1x1 filters reach this kernel.
+        let offsetX = param.filter.width/2 - Int(param.paddings[0])
+        let offsetY = param.filter.height/2 - Int(param.paddings[1])
         
+        // Texture slices times 4, minus dim[3] — presumably the channel padding of the last slice (TODO confirm).
+        let paddedZ = param.input.metalTexture.arrayLength * 4 - param.input.dim[3]
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(paddedZ))
     }
     
     func compute(commandBuffer: MTLCommandBuffer, param: ConvAddParam<P>) throws {
+        // Encodes one dispatch of the kernel into `commandBuffer`; throws if no compute encoder can be created.
+        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+            throw PaddleMobileError.predictError(message: " encode is nil")
+        }
+        // Slots mirror the `conv_add_1x1` signature: textures 0/1 = input/output, buffers 0/1/2 = params/weights/bias.
+        encoder.setTexture(param.input.metalTexture, index: 0)
+        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+        encoder.setBuffer(param.y.buffer, offset: 0, index: 2)
+        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+        encoder.endEncoding()
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
index 7286e11d9a618ef6943b5d8462dc3a3e07072e1f..660235eb1431b716c499ca729cf4300cbe228309 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.metal
@@ -66,7 +66,6 @@ struct MetalConvParam {
 //
 //}
 
-
 kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
@@ -120,8 +119,6 @@ kernel void conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample>
     outTexture.write(output, gid.xy, gid.z);
 }
 
-
-
 kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
@@ -165,14 +162,11 @@ kernel void conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample>
     outTexture.write(output, gid.xy, gid.z);
 }
 
-
 kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
                                          const device float4 *weights [[buffer(1)]],
                                          const device float4 *biase [[buffer(2)]],
-                                         const device float4 *new_scale [[buffer(3)]],
-                                         const device float4 *new_biase [[buffer(4)]],
                                          uint3 gid [[thread_position_in_grid]]) {
     
     if (gid.x >= outTexture.get_width() ||
@@ -210,10 +204,10 @@ kernel void conv_add_1x1(texture2d_array<float, access::sample> inTexture [[text
 }
 
 
-kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                                          constant MetalConvParam &param [[buffer(0)]],
-                                         const device float4 *weights [[buffer(1)]],
+                                         const device float *weights [[buffer(1)]],
                                          const device float4 *biase [[buffer(2)]],
                                          const device float4 *new_scale [[buffer(3)]],
                                          const device float4 *new_biase [[buffer(4)]],
@@ -225,11 +219,10 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
         return;
     }
     uint output_slice = gid.z;
-    
     short2 posInInput = short2(gid.xy) + short2(param.offsetX, param.offsetY);
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
     const uint kernelHXW = 9;
-    uint weithTo = gid.z * kernelHXW;
+    uint weithTo = gid.z * kernelHXW * 4; // first weight of this slice: 4 channels with 9 taps each
     float4 output = float4(0.0);
     float4 inputs[9];
     inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
@@ -243,13 +236,12 @@ kernel void depthwise_conv_add_batch_norm_relu_1x1(texture2d_array<float, access
     inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
     for (int j = 0; j < 9; ++j) {
         float4 input = inputs[j];
-        float4 weight = weights[weithTo + j];
-        output.x += input.x * weight.x;
-        output.y += input.y * weight.y;
-        output.z += input.z * weight.z;
-        output.w += input.w * weight.w;
+        output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; // channel c reads tap j at weithTo + c * kernelHXW + j
+        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
     }
-    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0);
+    output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); // keep the ReLU clamp the kernel name promises
     outTexture.write(output, gid.xy, gid.z);
 }
 
diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
index 92bfe88c4994791e1d11646cf8796b8a8461f176..39891c77de6cd96ec9fd93c5fd78b5d944b5c71e 100644
--- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift
@@ -95,12 +95,14 @@ class Tensor<P: PrecisionType>: Tensorial {
                 let cSlices = (C + 3) / 4
                 let paddedC = cSlices * 4
                 let count = paddedC * dim[0] * dim[1] * dim[2]
-                buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
                 if C == paddedC {
+                    buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
                     buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
                 } else if C == 1 {
-                    buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout<P>.stride)
+                    buffer = device.makeBuffer(length: numel() * MemoryLayout<P>.stride)
+                    buffer?.contents().copyMemory(from: data.pointer, byteCount: numel() * MemoryLayout<P>.stride)
                 } else {
+                    buffer = device.makeBuffer(length: count * MemoryLayout<P>.stride)
                     var tmpPointer = data.pointer
                     var dstPtr = buffer?.contents().bindMemory(to: P.self, capacity: count)
                     for _ in 0..<dim[0] * dim[1] * dim[2] {
@@ -120,7 +122,8 @@ class Tensor<P: PrecisionType>: Tensorial {
         } else {
             fatalError(" not support !")
         }
-        data.release()
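+        // TODO: data.release() is disabled below, so this path no longer frees the source data; re-enable it once ownership is settled.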
+//        data.release()
     }
     
     var width: Int {