Merge pull request #855 from codeWorm2015/metal

update

Merge pull request #855 from codeWorm2015/metal
update
81191e32 · Ruilong Liu · GitHub · b5d66d48 · 1c92467b · 81191e32
10 changed files
--- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal
@@ -80,7 +80,7 @@ kernel void genet_preprocess(texture2d<float, access::read> inTexture [[texture(
      gid.y >= outTexture.get_height()) {
    return;
  }
-  const auto means = float4(123.68f, 116.78f, 103.94f, 0.0f);
+  const auto means = float4(128.0f, 128.0f, 128.0f, 0.0f);
  const float4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017;
  outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid);
 }

--- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
+++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift
@@ -79,7 +79,7 @@ class ViewController: UIViewController {
      return
    }
    do {
-      let max = 50
+      let max = 1
      let startDate = Date.init()
      for i in 0..<max {
        try net.predict(inTexture: inTexture) { [weak self] (result) in

--- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
+++ b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj
@@ -38,7 +38,6 @@
 		FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; };
 		FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; };
 		FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; };
-		FC27990E21341016000B6BAD /* BoxCoder.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC27990D21341016000B6BAD /* BoxCoder.metal */; };
 		FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; };
 		FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; };
 		FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; };
@@ -51,6 +50,7 @@
 		FC9D038420E23B01000F735A /* Texture.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC9D038320E23B01000F735A /* Texture.swift */; };
 		FCA3A1632132A4AC00084FE5 /* ReshapeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */; };
 		FCA3A1652132A5EB00084FE5 /* Common.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA3A1642132A5EB00084FE5 /* Common.metal */; };
+		FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */; };
 		FCBCCC572122F41300D94F7E /* DwConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */; };
 		FCBCCC592122F42700D94F7E /* ConvBNReluOp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */; };
 		FCBCCC5B2122F66F00D94F7E /* ConvBNReluKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */; };
@@ -136,6 +136,7 @@
 		FC9D038320E23B01000F735A /* Texture.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Texture.swift; sourceTree = "<group>"; };
 		FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ReshapeKernel.metal; sourceTree = "<group>"; };
 		FCA3A1642132A5EB00084FE5 /* Common.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Common.metal; sourceTree = "<group>"; };
+		FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvTransposeKernel.metal; sourceTree = "<group>"; };
 		FCBCCC562122F41300D94F7E /* DwConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DwConvBNReluOp.swift; sourceTree = "<group>"; };
 		FCBCCC582122F42700D94F7E /* ConvBNReluOp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluOp.swift; sourceTree = "<group>"; };
 		FCBCCC5A2122F66F00D94F7E /* ConvBNReluKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ConvBNReluKernel.swift; sourceTree = "<group>"; };
@@ -364,6 +365,7 @@
 				FCDDC6CE212FE14700E5EF74 /* PriorBoxKernel.metal */,
 				FCA3A1622132A4AC00084FE5 /* ReshapeKernel.metal */,
 				FCA3A1642132A5EB00084FE5 /* Common.metal */,
+				FCA67B1621364EF000BD58AA /* ConvTransposeKernel.metal */,
 			);
 			path = metal;
 			sourceTree = "<group>";
@@ -529,6 +531,7 @@
 				FCBCCC612122FBDF00D94F7E /* PriorBoxKernel.swift in Sources */,
 				FCBCCC5F2122FB3B00D94F7E /* PriorBoxOp.swift in Sources */,
 				FC9D038220E2312E000F735A /* FetchOp.swift in Sources */,
+				FCA67B1721364EF000BD58AA /* ConvTransposeKernel.metal in Sources */,
 				FC039BBD20E11CC20081E9F8 /* Program.swift in Sources */,
 				FC039BA220E11CB70081E9F8 /* Loader.swift in Sources */,
 				FCBCCC67212306B000D94F7E /* ConcatOp.swift in Sources */,

--- a/metal/paddle-mobile/paddle-mobile/Executor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Executor.swift
@@ -62,7 +62,7 @@ public class Executor<P: PrecisionType> {
    queue = inQueue
    for block in inProgram.programDesc.blocks {
      //block.ops.count
-      for i in 0..<block.ops.count {
+      for i in 0..<4 {
        let op = block.ops[i]
        do {
          let op = try OpCreator<P>.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope)
@@ -110,13 +110,12 @@ public class Executor<P: PrecisionType> {
    }
    
    buffer.addCompletedHandler { (commandbuffer) in
-//      return;
    
 //      let inputArr = resInput.floatArray(res: { (p:P) -> P in
 //        return p
 //      })
-      
-//      writeToLibrary(fileName: "input_hand", array: inputArr)
+//
+//      writeToLibrary(fileName: "genet_input_hand", array: inputArr)
 //      print("write to library done")
 //      return
      //            print(inputArr)
@@ -125,10 +124,13 @@ public class Executor<P: PrecisionType> {
      //            print(stridableInput)
      
      //            let _: Flo? = input.logDesc(header: "input: ", stridable: true)
-//      for op in self.ops {
-//          op.delogOutput()
-//      }
-//                  return
+      for i in 0..<self.ops.count {
+        let op = self.ops[i]
+        print(" 第 \(i) 个 op: ")
+        op.delogOutput()
+      }
+      
+      return
      
      let afterDate = Date.init()
     

--- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift
@@ -97,6 +97,7 @@ class ConvAddOp<P: PrecisionType>: Operator<ConvAddKernel<P>, ConvAddParam<P>>,
  }
  
  func delogOutput() {
+    print(" \(type) output: ")
    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -356,78 +356,6 @@ kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array<float, access
    outTexture.write(output, gid.xy, gid.z);
 }

-struct MetalConvTransposeParam{
-  ushort kernelW;
-  ushort kernelH;
-  
-  ushort strideX;
-  ushort strideY;
-  
-  ushort paddingX;
-  ushort paddingY;
-  
-  ushort dilationX;
-  ushort dilationY;
-};
-
-kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
-                           texture2d_array<float, access::write> outTexture [[texture(1)]],
-                           constant MetalConvTransposeParam &param [[buffer(0)]],
-                           const device float4 *weights [[buffer(1)]],
-                           uint3 gid [[thread_position_in_grid]]){
-  if (gid.x >= outTexture.get_width() ||
-      gid.y >= outTexture.get_height() ||
-      gid.z >= outTexture.get_array_size()) {
-    return;
-  }
-  
-  int input_array_size = inTexture.get_array_size();
-  
-  uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
-
-  uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
-  
-  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
-  
-  float4 output;
-  
-  for (int w = 0; w < param.kernelW; ++w) {
-    int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX;
-    if (input_x < 0 || input_x >= int(inTexture.get_width())) {
-      continue;
-    }
-    
-    for (int h = 0; h < param.kernelH; ++h) {
-      int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY;
-      if (input_y < 0 || input_y >= int(inTexture.get_height())) {
-        continue;
-      }
-      
-      uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size();
-      
-      for (int slice = 0; slice < input_array_size; ++slice) {
-        
-        float4 input;
-        float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice];
-        float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice];
-
-        float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice];
-
-        float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice];
-        
-        input = inTexture.sample(sample, float2(input_x,    input_x), slice);
-        output.x += dot(input, kernel_slice);
-        output.x += dot(input, kernel_slice1);
-        output.x += dot(input, kernel_slice2);
-        output.x += dot(input, kernel_slice3);
-      }
-    }
-  }
-
-  outTexture.write(output, gid.xy, gid.z);
-}
-
-
 // conv
 #pragma mark -- conv
 kernel void conv_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ 
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ 
+ http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include <metal_stdlib>
+using namespace metal;
+
+// Parameters for the `conv_transpose` kernel, bound at buffer(0).
+// All fields are 16-bit unsigned; the host-side layout must match this order.
+struct MetalConvTransposeParam{
+  ushort kernelW;
+  ushort kernelH;
+  
+  ushort strideX;
+  ushort strideY;
+  
+  ushort paddingX;
+  ushort paddingY;
+  
+  ushort dilationX;
+  ushort dilationY;
+};
+
+// Transposed convolution over a texture array.
+//
+// One thread handles one output texel (gid.xy) of one output slice (gid.z).
+// For every kernel tap it maps the output position back to an input position,
+// samples all input slices there and accumulates four dot products, one per
+// component of the output texel.
+//
+//   inTexture  - input feature map, sampled with pixel coordinates.
+//   outTexture - output feature map, one float4 written per thread.
+//   param      - kernel size, stride, padding and dilation.
+//   weights    - float4 weights; indexed as
+//                [gid.z * 4 + component][w * kernelH + h][input slice]
+//                (layout inferred from the indexing below - confirm with the
+//                host-side weight packing).
+kernel void conv_transpose(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                           texture2d_array<float, access::write> outTexture [[texture(1)]],
+                           constant MetalConvTransposeParam &param [[buffer(0)]],
+                           const device float4 *weights [[buffer(1)]],
+                           uint3 gid [[thread_position_in_grid]]){
+  // Threads dispatched outside the output texture have nothing to write.
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  
+  int input_array_size = inTexture.get_array_size();
+  
+  // Number of float4 weights that feed a single output component.
+  uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH;
+  
+  // Each output slice consumes four consecutive weight groups (one per component).
+  uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice);
+  
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  
+  // The accumulator must start at zero; it is only ever updated with `+=`.
+  float4 output = float4(0.0f);
+  
+  for (int w = 0; w < param.kernelW; ++w) {
+    // NOTE(review): gid.x is unsigned, so this expression is evaluated in
+    // unsigned arithmetic and the division truncates. Positions that are
+    // negative or not a multiple of the stride are not rejected explicitly -
+    // confirm against the reference conv-transpose operator.
+    int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX;
+    if (input_x < 0 || input_x >= int(inTexture.get_width())) {
+      continue;
+    }
+    
+    for (int h = 0; h < param.kernelH; ++h) {
+      // NOTE(review): same unsigned/truncation caveat as for input_x above.
+      int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY;
+      if (input_y < 0 || input_y >= int(inTexture.get_height())) {
+        continue;
+      }
+      
+      // Offset of this (w, h) tap inside one weight group.
+      uint kernel_index = (w * param.kernelH + h) * input_array_size;
+      
+      for (int slice = 0; slice < input_array_size; ++slice) {
+        uint weight_index = kernel_stride_z + kernel_index + slice;
+        
+        float4 kernel_slice = weights[weight_index + 0 * kernel_one_output_slice];
+        float4 kernel_slice1 = weights[weight_index + 1 * kernel_one_output_slice];
+        float4 kernel_slice2 = weights[weight_index + 2 * kernel_one_output_slice];
+        float4 kernel_slice3 = weights[weight_index + 3 * kernel_one_output_slice];
+        
+        // Sample at (input_x, input_y); the row coordinate was previously
+        // passed as input_x, which ignored input_y entirely.
+        float4 input = inTexture.sample(sample, float2(input_x, input_y), slice);
+        
+        // One weight group per output component.
+        output.x += dot(input, kernel_slice);
+        output.y += dot(input, kernel_slice1);
+        output.z += dot(input, kernel_slice2);
+        output.w += dot(input, kernel_slice3);
+      }
+    }
+  }
+  
+  outTexture.write(output, gid.xy, gid.z);
+}
+
--- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift
@@ -58,12 +58,16 @@ class PoolOp<P: PrecisionType>: Operator<PoolKernel<P>, PoolParam<P>>, Runable,
  }
  
  func delogOutput() {
-    print("pool2d delog")
-    let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
-    print(para.ksize)
-    print(para.stride)
-    print(para.padding)
-    print(para.poolType)
-    let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
+    print(" \(type) output: ")
+    print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray())
+
+    
+//    print("pool2d delog")
+//    let _: P? = para.input.metalTexture.logDesc(header: "pool2d input: ", stridable: true)
+//    print(para.ksize)
+//    print(para.stride)
+//    print(para.padding)
+//    print(para.poolType)
+//    let _: P? = para.output.metalTexture.logDesc(header: "pool2d output: ", stridable: true)
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift
@@ -50,8 +50,8 @@ class PreluOp<P: PrecisionType>: Operator<PreluKernel<P>, PreluParam<P>>, Runabl
  }
  
  func delogOutput() {
-    print("softmax delog")
-    let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
-    let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
+//    print("softmax delog")
+//    let _: P? = para.input.metalTexture.logDesc(header: "softmax input: ", stridable: false)
+//    let _: P? = para.output.metalTexture.logDesc(header: "softmax output: ", stridable: false)
  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift
@@ -43,6 +43,12 @@ class ReluOp<P: PrecisionType>: Operator<ReluKernel<P>, ReluParam<P>>, Runable,
      throw error
    }
  }
+  
+  /// Debug helper: prints this operator's `type` as a header, then the
+  /// output texture converted to a tensor and passed through `strideArray()`.
+  func delogOutput() {
+    print(" \(type) output: ")
+    let outputDim = para.output.tensorDim
+    let outputTensor = para.output.metalTexture.toTensor(
+      dim: (n: outputDim[0], c: outputDim[1], h: outputDim[2], w: outputDim[3]))
+    print(outputTensor.strideArray())
+  }
+  
 }