Merge pull request #1481 from codeWorm2015/develop

add mps support

Merge pull request #1481 from codeWorm2015/develop
add mps support
e5e51936 · Ray Liu · GitHub · 09ec8398 · 548723ec · e5e51936
11 changed files
--- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/project.pbxproj
@@ -33,8 +33,6 @@
 		FC5E03B221DCE8D90016C137 /* mingren_input_data in Resources */ = {isa = PBXBuildFile; fileRef = FC5E03B121DCE8D90016C137 /* mingren_input_data */; };
 		FC704C1921D2375300F98BAB /* super_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1721D2375300F98BAB /* super_params */; };
 		FC704C1A21D2375300F98BAB /* super_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1821D2375300F98BAB /* super_model */; };
-		FC704C2221D237FC00F98BAB /* combined_mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */; };
-		FC704C2321D237FC00F98BAB /* combined_mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */; };
 		FC704C2421D237FC00F98BAB /* yolo_params in Resources */ = {isa = PBXBuildFile; fileRef = FC704C2021D237FC00F98BAB /* yolo_params */; };
 		FC704C2521D237FC00F98BAB /* yolo_model in Resources */ = {isa = PBXBuildFile; fileRef = FC704C2121D237FC00F98BAB /* yolo_model */; };
 		FC803BCD214D27930094B8E5 /* FPSCounter.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC803BCB214D27920094B8E5 /* FPSCounter.swift */; };
@@ -49,6 +47,9 @@
 		FCBCCC552122EF5500D94F7E /* MetalHelper.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC542122EF5400D94F7E /* MetalHelper.swift */; };
 		FCC15E15221E716500DC3CB2 /* paddle-mobile-metallib.metallib in Resources */ = {isa = PBXBuildFile; fileRef = FCC15E14221E716400DC3CB2 /* paddle-mobile-metallib.metallib */; };
 		FCCED60521D7646E00BE8D5F /* test_image_super in Resources */ = {isa = PBXBuildFile; fileRef = FCCED60421D7646E00BE8D5F /* test_image_super */; };
+		FCE834AE2232A4AE0057BF43 /* combined_mobilenet_params in Resources */ = {isa = PBXBuildFile; fileRef = FCE834AC2232A4AE0057BF43 /* combined_mobilenet_params */; };
+		FCE834AF2232A4AE0057BF43 /* combined_mobilenet_model in Resources */ = {isa = PBXBuildFile; fileRef = FCE834AD2232A4AE0057BF43 /* combined_mobilenet_model */; };
+		FCE834B12232B6DC0057BF43 /* vision_synset.txt in Resources */ = {isa = PBXBuildFile; fileRef = FCE834B02232B6DC0057BF43 /* vision_synset.txt */; };
 		FCEBEC2C20E1391F00C0B14D /* paddle_mobile.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; };
 		FCEBEC2D20E1391F00C0B14D /* paddle_mobile.framework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
 		FCF437E8214B6DDB00943429 /* MultiPredictViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */; };
@@ -105,8 +106,6 @@
 		FC5E03B121DCE8D90016C137 /* mingren_input_data */ = {isa = PBXFileReference; lastKnownFileType = file; path = mingren_input_data; sourceTree = "<group>"; };
 		FC704C1721D2375300F98BAB /* super_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = super_params; sourceTree = "<group>"; };
 		FC704C1821D2375300F98BAB /* super_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = super_model; sourceTree = "<group>"; };
-		FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_params; sourceTree = "<group>"; };
-		FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_model; sourceTree = "<group>"; };
 		FC704C2021D237FC00F98BAB /* yolo_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_params; sourceTree = "<group>"; };
 		FC704C2121D237FC00F98BAB /* yolo_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = yolo_model; sourceTree = "<group>"; };
 		FC803BCB214D27920094B8E5 /* FPSCounter.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = FPSCounter.swift; sourceTree = "<group>"; };
@@ -121,6 +120,9 @@
 		FCBCCC542122EF5400D94F7E /* MetalHelper.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MetalHelper.swift; sourceTree = "<group>"; };
 		FCC15E14221E716400DC3CB2 /* paddle-mobile-metallib.metallib */ = {isa = PBXFileReference; lastKnownFileType = "archive.metal-library"; name = "paddle-mobile-metallib.metallib"; path = "../../../../Library/Developer/Xcode/DerivedData/paddle-mobile-hdsimtkoxoondndnjczkbkchcwyh/Build/Products/Release-iphoneos/paddle-mobile-metallib.metallib"; sourceTree = "<group>"; };
 		FCCED60421D7646E00BE8D5F /* test_image_super */ = {isa = PBXFileReference; lastKnownFileType = file; path = test_image_super; sourceTree = "<group>"; };
+		FCE834AC2232A4AE0057BF43 /* combined_mobilenet_params */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_params; sourceTree = "<group>"; };
+		FCE834AD2232A4AE0057BF43 /* combined_mobilenet_model */ = {isa = PBXFileReference; lastKnownFileType = file; path = combined_mobilenet_model; sourceTree = "<group>"; };
+		FCE834B02232B6DC0057BF43 /* vision_synset.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = vision_synset.txt; sourceTree = "<group>"; };
 		FCEBEC2B20E1391F00C0B14D /* paddle_mobile.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; path = paddle_mobile.framework; sourceTree = BUILT_PRODUCTS_DIR; };
 		FCF437E7214B6DDB00943429 /* MultiPredictViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = MultiPredictViewController.swift; sourceTree = "<group>"; };
 		FCFADE33222F63CB0037DCE8 /* test_big.JPG */ = {isa = PBXFileReference; lastKnownFileType = image.jpeg; path = test_big.JPG; sourceTree = "<group>"; };
@@ -267,22 +269,13 @@
 		FC704C1B21D237FC00F98BAB /* vision_model */ = {
 			isa = PBXGroup;
 			children = (
+				FCE834AB2232A4AE0057BF43 /* vision_mobilenet */,
 				FCAFD8482231614200496A36 /* yolo_16 */,
-				FC704C1C21D237FC00F98BAB /* mobilenet */,
 				FC704C1F21D237FC00F98BAB /* yolo */,
 			);
 			path = vision_model;
 			sourceTree = "<group>";
 		};
-		FC704C1C21D237FC00F98BAB /* mobilenet */ = {
-			isa = PBXGroup;
-			children = (
-				FC704C1D21D237FC00F98BAB /* combined_mobilenet_params */,
-				FC704C1E21D237FC00F98BAB /* combined_mobilenet_model */,
-			);
-			path = mobilenet;
-			sourceTree = "<group>";
-		};
 		FC704C1F21D237FC00F98BAB /* yolo */ = {
 			isa = PBXGroup;
 			children = (
@@ -336,6 +329,16 @@
 			path = yolo_16;
 			sourceTree = "<group>";
 		};
+		FCE834AB2232A4AE0057BF43 /* vision_mobilenet */ = {
+			isa = PBXGroup;
+			children = (
+				FCE834B02232B6DC0057BF43 /* vision_synset.txt */,
+				FCE834AC2232A4AE0057BF43 /* combined_mobilenet_params */,
+				FCE834AD2232A4AE0057BF43 /* combined_mobilenet_model */,
+			);
+			path = vision_mobilenet;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */
 /* Begin PBXNativeTarget section */
@@ -401,8 +404,8 @@
 				FCCED60521D7646E00BE8D5F /* test_image_super in Resources */,
 				FC039B8C20E11C560081E9F8 /* LaunchScreen.storyboard in Resources */,
 				FC9797CF21D6506F00F2FD90 /* mingren.jpg in Resources */,
-				FC704C2221D237FC00F98BAB /* combined_mobilenet_params in Resources */,
 				FCAFD84B2231614200496A36 /* yolo_16_param in Resources */,
+				FCE834AF2232A4AE0057BF43 /* combined_mobilenet_model in Resources */,
 				FC704C1921D2375300F98BAB /* super_params in Resources */,
 				FC2BFCBE21DF15D900C262B2 /* 123.jpg in Resources */,
 				FC039B8920E11C560081E9F8 /* Assets.xcassets in Resources */,
@@ -411,14 +414,15 @@
 				FC5E03B221DCE8D90016C137 /* mingren_input_data in Resources */,
 				FC704C1A21D2375300F98BAB /* super_model in Resources */,
 				FC039B8720E11C550081E9F8 /* Main.storyboard in Resources */,
+				FCE834B12232B6DC0057BF43 /* vision_synset.txt in Resources */,
 				FC9797C221D608E000F2FD90 /* mobilenet_model in Resources */,
 				FCAFD84C2231614200496A36 /* yolo_16_model in Resources */,
 				FC2BFCC021DF279900C262B2 /* classify-img-output.png in Resources */,
 				FC203FB221CBFDBA00B37166 /* test.jpg in Resources */,
 				FCC15E15221E716500DC3CB2 /* paddle-mobile-metallib.metallib in Resources */,
-				FC704C2321D237FC00F98BAB /* combined_mobilenet_model in Resources */,
 				FC9797C321D608E000F2FD90 /* mobilenet_params in Resources */,
 				FC704C2421D237FC00F98BAB /* yolo_params in Resources */,
+				FCE834AE2232A4AE0057BF43 /* combined_mobilenet_params in Resources */,
 				FC2BFCBC21DF0A8600C262B2 /* 00001.jpg in Resources */,
 				FC9797BE21D6045B00F2FD90 /* banana.jpeg in Resources */,
 				FC704C2521D237FC00F98BAB /* yolo_model in Resources */,

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetCombined.swift
@@ -24,10 +24,35 @@ public class MobileNetCombined: Net {
        inputDim = Dim.init(inDim: [1, 224, 224, 3])
        metalLoadMode = .LoadMetalInCustomMetalLib
        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
+        useMPS = true
+    }
+    let labels = PreWords.init(fileName: "vision_synset")
+    /// Label table read from a bundled text resource.
+    ///
+    /// The file is split on newlines; lines of ten characters or fewer are
+    /// discarded and the first ten characters of every remaining line are
+    /// dropped (presumably a fixed-width id column — confirm against the
+    /// synset file format).
+    class PreWords {
+        /// Label text of every kept line, in file order.
+        var contents: [String] = []
+        /// Loads `fileName.type` from `inBundle`; traps if the resource is
+        /// missing or cannot be read.
+        init(fileName: String, type: String = "txt", inBundle: Bundle = Bundle.main) {
+            guard let resourcePath = inBundle.path(forResource: fileName, ofType: type) else {
+                fatalError("no file call \(fileName)")
+            }
+            let prefixLength = 10
+            let rawText = try! String.init(contentsOfFile: resourcePath)
+            let lines = rawText.components(separatedBy: CharacterSet.newlines)
+            // Only lines longer than the prefix can carry a label.
+            let labelled = lines.filter { $0.count > prefixLength }
+            contents = labelled.map { String($0.dropFirst(prefixLength)) }
+        }
+        /// Returns the label at `index`; traps when `index` is out of range.
+        subscript(index: Int) -> String {
+            return contents[index]
+        }
    }
    override  public func resultStr(res: [ResultHolder]) -> String {
-        return " \(res[0].result[0]) ... "
+        // Only the first result holder is reported.
+        let holder = res[0]
+        let scores = (0..<holder.capacity).map { holder.result[$0] }
+        // `top(r:)` is defined elsewhere; presumably it yields the five best
+        // (index, score) pairs — confirm against its definition.
+        let lines = scores.top(r: 5).enumerated().map { ranked in
+            String(format: "%d: %@ (%3.2f%%)", ranked.offset + 1, labels[ranked.element.0], ranked.element.1 * 100)
+        }
+        return lines.joined(separator: "\n")
    }
 }
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/YoloNet.swift
@@ -25,7 +25,7 @@ public class YoloNet: Net {
        inputDim = Dim.init(inDim: [1, 416, 416, 3])
        metalLoadMode = .LoadMetalInCustomMetalLib
        metalLibPath = Bundle.main.path(forResource: "paddle-mobile-metallib", ofType: "metallib")
-        useMPS = false
+        useMPS = true
        paramPrecision = .Float16
    }

--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddMetal.metal
@@ -354,7 +354,7 @@ kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[
    uint input_arr_size = inTexture.get_array_size();
    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    half4 output = biase[gid.z];
+    float4 output = float4(biase[gid.z]);
    ushort dilation_x = param.dilationX;
    ushort dilation_y = param.dilationY;
@@ -385,7 +385,7 @@ kernel void conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[
        }
    }
    //  output = output + float4(biase[gid.z]);
-    outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
 kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
@@ -406,7 +406,7 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
    const uint kernelHXW = 9;
    uint weithTo = gid.z * kernelHXW * 4;
-    half4 output = biase[gid.z];
+    float4 output = float4(biase[gid.z]);
    half4 inputs[9];
    inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
    inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
@@ -419,13 +419,13 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array<half, access::sample> in
    inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
    for (int j = 0; j < 9; ++j) {
        half4 input = inputs[j];
-        output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+        output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]);
-        output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+        output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]);
-        output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+        output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]);
-        output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+        output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]);
    }
    //  output = output + float4(biase[gid.z]);
-    outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
@@ -453,7 +453,7 @@ kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[
    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    half4 output = biase[gid.z];
+    float4 output = float4(biase[gid.z]);
    ushort dilation_y = param.dilationY;
    half4 input[5];
@@ -471,20 +471,20 @@ kernel void conv_add_5x1_half(texture2d_array<half, access::sample> inTexture [[
        for (int j = 0; j < 5; ++j) {
            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.x += dot(input[j], weight_x);
+            output.x += dot(float4(input[j]), float4(weight_x));
            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.y += dot(input[j], weight_y);
+            output.y += dot(float4(input[j]), float4(weight_y));
            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.z += dot(input[j], weight_z);
+            output.z += dot(float4(input[j]), float4(weight_z));
            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.w += dot(input[j], weight_w);
+            output.w += dot(float4(input[j]), float4(weight_w));
        }
    }
    //  output = output + float4(biase[gid.z]);
-    outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }
@@ -512,7 +512,7 @@ kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[
    uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
-    half4 output = biase[gid.z];
+    float4 output = float4(biase[gid.z]);
    ushort dilation_x = param.dilationX;
    half4 input[5];
@@ -530,20 +530,20 @@ kernel void conv_add_1x5_half(texture2d_array<half, access::sample> inTexture [[
        for (int j = 0; j < 5; ++j) {
            half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.x += dot(input[j], weight_x);
+            output.x += dot(float4(input[j]), float4(weight_x));
            half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.y += dot(input[j], weight_y);
+            output.y += dot(float4(input[j]), float4(weight_y));
            half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.z += dot(input[j], weight_z);
+            output.z += dot(float4(input[j]), float4(weight_z));
            half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
-            output.w += dot(input[j], weight_w);
+            output.w += dot(float4(input[j]), float4(weight_w));
        }
    }
    //  output = output + float4(biase[gid.z]);
-    outTexture.write(output, gid.xy, gid.z);
+    outTexture.write(half4(output), gid.xy, gid.z);
 }

--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Executor.swift
@@ -117,10 +117,9 @@ public class Executor<P: PrecisionProtocol>: Executorable{
            //将输入写进文件
            /*
             let inputArr = resInput.toTensor(dim: (n: dim[0], c: dim[3], h: dim[1], w: dim[2]))
             print(dim)
-             writeToLibrary(fileName: "yolo_input", array: inputArr)
+             writeToLibrary(fileName: "mobilenet_input", array: inputArr)
             print(" write done ")
             return
             */

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -27,6 +27,78 @@ func getUniqueKey() -> String {
    return UUID.init().uuidString
 }
+/// Feeds an `MPSCNNConvolution` with its descriptor, weights and bias terms
+/// taken from paddle-mobile tensors.
+///
+/// `biasTerms()` must hand back `Float` values. For `Float32` precision the
+/// bias tensor's own storage is exposed directly (borrowed, never freed
+/// here). For `Float16` precision `load()` widens the bias into a buffer this
+/// object owns until `purge()` releases it.
+@available(iOS 11.0, *)
+class ConvDataSource<P: PrecisionProtocol>: NSObject, MPSCNNConvolutionDataSource {
+    var _descriptor: MPSCNNConvolutionDescriptor
+    var _weightsTensor: Tensor<P>
+    var _biasTensor: Tensor<P>
+    /// Bias values as `Float`; owned by this object only in the `Float16` case.
+    var _biasTerms: UnsafeMutablePointer<Float>?
+    /// Makes the bias terms available.
+    /// - Returns: `false` when the bias tensor storage does not have the
+    ///   element type implied by `P.precisionType`, so uninitialised memory
+    ///   is never handed out as bias values.
+    func load() -> Bool {
+        switch P.precisionType {
+        case .Float32:
+            _biasTerms = _biasTensor.data.pointer as? UnsafeMutablePointer<Float>
+            return _biasTerms != nil
+        case .Float16:
+            guard let float16Point = _biasTensor.data.pointer as? UnsafeMutablePointer<Float16> else {
+                return false
+            }
+            // A repeated load() must not leak the buffer of the previous one.
+            purge()
+            let count = _biasTensor.data.count
+            let converted = UnsafeMutablePointer<Float>.allocate(capacity: count)
+            float16to32(input: float16Point, output: converted, count: count)
+            _biasTerms = converted
+            return true
+        }
+    }
+    /// Releases the buffer allocated by `load()`; safe to call repeatedly.
+    func purge() {
+        switch P.precisionType {
+        case .Float32:
+            // The pointer is borrowed from the bias tensor; nothing to free.
+            return
+        case .Float16:
+            if let owned = _biasTerms {
+                owned.deinitialize(count: _biasTensor.data.count)
+                owned.deallocate()
+            }
+            // Drop the pointer so neither a second purge() nor biasTerms()
+            // can touch freed memory.
+            _biasTerms = nil
+        }
+    }
+    func label() -> String? {
+        return "conv_add_label"
+    }
+    /// Returns `self`; no independent copy is made.
+    func copy(with zone: NSZone? = nil) -> Any {
+        return self
+    }
+    /// - Parameters:
+    ///   - inDesc: convolution descriptor returned by `descriptor()`.
+    ///   - inWeights: filter tensor whose storage is returned by `weights()`.
+    ///   - inBiasTerms: bias tensor, converted to `Float` on `load()` when needed.
+    init(inDesc: MPSCNNConvolutionDescriptor,
+                  inWeights: Tensor<P>,
+                  inBiasTerms: Tensor<P>) {
+        _descriptor = inDesc
+        _weightsTensor = inWeights
+        _biasTensor = inBiasTerms
+        super.init()
+    }
+    func descriptor() -> MPSCNNConvolutionDescriptor {
+        return _descriptor
+    }
+    /// Element type of the buffer returned by `weights()`.
+    func dataType() -> MPSDataType {
+        switch P.precisionType {
+        case .Float32:
+            return .float32
+        case .Float16:
+            return .float16
+        }
+    }
+    func weights() -> UnsafeMutableRawPointer {
+        return UnsafeMutableRawPointer.init(_weightsTensor.data.pointer)
+    }
+    func biasTerms() -> UnsafeMutablePointer<Float>? {
+        return _biasTerms
+    }
+}
 class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
    var metalParam: MetalConvParam!
@@ -40,30 +112,37 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
        let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1)/2 - Int(param.paddings[0])
        let key = identifyingKey
-        if initContext.useMPS {
-            if #available(iOS 10.0, *) {
+        if initContext.useMPS {  // 使用 apple 的 MetalPerformanceShaders
+            if #available(iOS 11.0, *) {
+                var desc: MPSCNNConvolutionDescriptor?
+                // 如果不是 depth wise, 并且输入输出 tensor channel 都大于 4
                if !(param.filter.tensorDim[1] == 1 && param.filter.tensorDim[0] == param.input.tensorDim[1]) && param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
+                    desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
-                    let desc = MPSCNNConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
                                                           kernelHeight: param.filter.tensorDim[2],
                                                           inputFeatureChannels: param.input.tensorDim[1],
                                                           outputFeatureChannels: param.output.tensorDim[1],
                                                           neuronFilter: nil)
-                    desc.strideInPixelsX = Int(param.stride[0])
+                    desc?.strideInPixelsX = Int(param.stride[0])
-                    desc.strideInPixelsY = Int(param.stride[1])
+                    desc?.strideInPixelsY = Int(param.stride[1])
+                } else if param.input.tensorDim[1] > 4 && param.output.tensorDim[1] > 4 {
-                    let tensorPointer = param.filter.convert(converter: MPSPointerConverter<P>.init())
+                    desc = MPSCNNDepthWiseConvolutionDescriptor(kernelWidth: param.filter.tensorDim[3],
-                    let yPointer = param.y.data.pointer
+                                                                    kernelHeight: param.filter.tensorDim[2],
+                                                                    inputFeatureChannels: param.input.tensorDim[1],
-                    tensorPointer.withMemoryRebound(to: Float.self, capacity: param.filter.numel()) { (weightPointer: UnsafeMutablePointer<Float>) in
+                                                                    outputFeatureChannels: param.output.tensorDim[1],
-                        yPointer.withMemoryRebound(to: Float.self, capacity: param.y.numel(), { (biasePointer: UnsafeMutablePointer<Float>) in
+                                                                    neuronFilter: nil)
-                            let conv = MPSCNNConvolution.init(device: device, convolutionDescriptor: desc, kernelWeights: weightPointer, biasTerms: biasePointer, flags: .none)
-                            conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
+                }
-                            conv.edgeMode = .zero
-                            convDic[key] = conv
+                desc?.strideInPixelsX = Int(param.stride[0])
-                        })
+                desc?.strideInPixelsY = Int(param.stride[1])
-                    }
+                if let inDesc = desc {
+                    let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
+                    let dataSource = ConvDataSource.init(inDesc: inDesc, inWeights: param.filter, inBiasTerms: param.y)
+                    let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
+                    conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
+                    conv.edgeMode = .zero
+                    convDic[key] = conv
                    imageDic[identifyingKey + "_input"] = MPSImage.init(texture: param.input.metalTexture, featureChannels: param.input.tensorDim[1])
                    imageDic[identifyingKey + "_output"] = MPSImage.init(texture: param.output.metalTexture, featureChannels: param.output.tensorDim[1])
                    super.init(device: device, inFunctionName: "place_holder", initContext: initContext)

--- a/test/net/test_mobilenet_GPU.cpp
+++ b/test/net/test_mobilenet_GPU.cpp
@@ -25,11 +25,11 @@ int main() {
  paddle_mobile.SetCLPath("/data/local/tmp/bin");
 #endif
-  //  auto isok =
+  auto isok = paddle_mobile.Load(
-  //      paddle_mobile.Load(std::string(g_mobilenet_mul) + "/model",
+      std::string(g_mobilenet_vision) + "/vision_mobilenet_model",
-  //                         std::string(g_mobilenet_mul) + "/params", true);
+      std::string(g_mobilenet_vision) + "/vision_mobilenet_params", true);
-  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
+  //  auto isok = paddle_mobile.Load(std::string(g_mobilenet), true);
  if (isok) {
    auto time2 = paddle_mobile::time();
    std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms"
@@ -37,12 +37,13 @@ int main() {
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
-    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+    GetInput<float>(g_test_image_1x3x224x224_vision_mobilenet_input, &input,
+                    dims);
    std::vector<float> vec_result = paddle_mobile.Predict(input, dims);
    auto time3 = paddle_mobile::time();
-    int max = 10;
+    int max = 1;
    for (int i = 0; i < max; ++i) {
      vec_result = paddle_mobile.Predict(input, dims);
    }

--- a/test/net/test_mobilenet_combine.cpp
+++ b/test/net/test_mobilenet_combine.cpp
@@ -20,14 +20,18 @@ int main() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  paddle_mobile.SetThreadNum(4);
  auto time1 = time();
-  if (paddle_mobile.Load(std::string(g_mobilenet_combined) + "/model",
-                         std::string(g_mobilenet_combined) + "/params", true)) {
+  if (paddle_mobile.Load(
+          std::string(g_mobilenet_vision) + "/vision_mobilenet_model",
+          std::string(g_mobilenet_vision) + "/vision_mobilenet_params", true)) {
    auto time2 = time();
    std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 224, 224};
-    GetInput<float>(g_test_image_1x3x224x224_banana, &input, dims);
+    GetInput<float>(g_test_image_1x3x224x224_vision_mobilenet_input, &input,
+                    dims);
    auto vec_result = paddle_mobile.Predict(input, dims);
    std::vector<float>::iterator biggest =
@@ -39,8 +43,9 @@ int main() {
    for (int i = 0; i < 10; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
    auto time3 = time();
-    for (int i = 0; i < 10; ++i) {
+    for (int i = 0; i < 1; ++i) {
      auto vec_result = paddle_mobile.Predict(input, dims);
    }
    auto time4 = time();

--- a/test/net/test_yolo_combined.cpp
+++ b/test/net/test_yolo_combined.cpp
@@ -23,15 +23,15 @@ int main() {
  //  ../../../test/models/mobilenet
  auto time1 = time();
-  if (paddle_mobile.Load(std::string(g_yolo_combined) + "/model",
+  if (paddle_mobile.Load(std::string(g_yolo_vision) + "/model",
-                         std::string(g_yolo_combined) + "/params", true)) {
+                         std::string(g_yolo_vision) + "/params", true)) {
    auto time2 = time();
    std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
    std::vector<int64_t> dims{1, 3, 416, 416};
    std::vector<float> input;
-    GetInput<float>(g_test_image_desktop_1_3_416_416_nchw_float, &input, dims);
+    GetInput<float>(g_test_image_1x3x416x416_vision_yolo_input, &input, dims);
    std::cout << "input.size():  " << input.size() << std::endl;
    for (int j = 0; j < 100; ++j) {
      std::cout << j << " :  " << input[j] << std::endl;
@@ -42,13 +42,6 @@ int main() {
    //        }
    auto time3 = time();
    const vector<float> vector_out = paddle_mobile.Predict(input, dims);
-    std::cout << "--------------------------------------------" << std::endl;
-    for (float i : vector_out) {
-      std::cout << i << std::endl;
-    }
-    std::cout << "--------------------------------------------" << std::endl;
    std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;

--- a/test/net/test_yologpu.cpp
+++ b/test/net/test_yologpu.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <iostream>
-#include <thread>
+#include <thread>  // NOLINT
 #include "../../src/common/types.h"
 #include "../../src/io/paddle_test_inference_api.h"
 #include "../test_helper.h"
@@ -31,8 +31,9 @@ void t1() {
  paddle_mobile_gpu.SetCLPath("/data/local/tmp/bin");
 #endif
  auto time1 = paddle_mobile::time();
-  auto isok = paddle_mobile_gpu.Load(std::string(g_yolo_mul) + "/model",
+  auto isok =
-                                     std::string(g_yolo_mul) + "/params", true);
+      paddle_mobile_gpu.Load(std::string(g_yolo_vision) + "/model",
+                             std::string(g_yolo_vision) + "/params", true);
  //  auto isok = paddle_mobile.Load(std::string(g_yolo_mul), true);
  if (isok) {
@@ -42,13 +43,13 @@ void t1() {
    std::vector<float> input;
    std::vector<int64_t> dims{1, 3, 416, 416};
-    GetInput<float>(g_yolo_img, &input, dims);
+    GetInput<float>(g_test_image_1x3x416x416_vision_yolo_input, &input, dims);
    std::vector<float> vec_result;
    //            = paddle_mobile.Predict(input, dims);
    auto time3 = paddle_mobile::time();
-    int max = 10;
+    int max = 1;
    for (int i = 0; i < max; ++i) {
      vec_result = paddle_mobile_gpu.Predict(input, dims);
    }
@@ -129,9 +130,9 @@ void t2() {
 void t3() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
  //    paddle_mobile.SetThreadNum(4);
-  //#ifdef PADDLE_MOBILE_CL
+  // #ifdef PADDLE_MOBILE_CL
  //  paddle_mobile.SetCLPath("/data/local/tmp/bin");
-  //#endif
+  // #endif
  auto time1 = paddle_mobile::time();
  auto isok = paddle_mobile.Load(std::string(g_yolo_mul) + "/model",
                                 std::string(g_yolo_mul) + "/params", true);

--- a/test/test_helper.h
+++ b/test/test_helper.h
@@ -51,6 +51,8 @@ static const char *g_yolo_combined = "../models/yolo_combined";
 static const char *g_yolo_mul = "../models/d";
 static const char *g_fluid_fssd_new = "../models/fluid_fssd_new";
 static const char *g_vgg16_ssd_combined = "../models/vgg16_ssd_combined";
+static const char *g_mobilenet_vision = "../models/vision_mobilenet";
+static const char *g_yolo_vision = "../models/vision_yolo";
 static const char *g_test_image_1x3x224x224 =
    "../images/test_image_1x3x224x224_float";
 static const char *g_test_image_1x3x224x224_banana =
@@ -65,10 +67,14 @@ static const char *g_img = "../images/img.bin";
 static const char *g_yolo_img = "../images/in_put_1_3_416_416_2";
 static const char *g_super_img = "../images/mingren_input_data";
 static const char *g_mobilenet_img = "../images/image";
+static const char *g_test_image_1x3x224x224_vision_mobilenet_input =
+    "../images/vision_mobilenet_input";
+static const char *g_test_image_1x3x416x416_vision_yolo_input =
+    "../images/yolo_input";
 using paddle_mobile::framework::DDim;
 using paddle_mobile::framework::Tensor;
-using namespace paddle_mobile;
+using namespace paddle_mobile;  // NOLINT
 template <typename T>
 void SetupTensor(paddle_mobile::framework::Tensor *input,