Commit e1351e5e authored by eclipsycn, committed by GitHub

Merge branch 'develop' into develop

@@ -9,7 +9,6 @@ option(LOG_PROFILE "log profile" ON)
 option(CPU "armv7 with neon" ON)
 option(MALI_GPU "mali gpu" OFF)
 option(FPGA "fpga" OFF)
-set(DEBUGING ON)
 if (ARM_LINUX)
   include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
......
@@ -121,7 +121,14 @@ public class MainActivity extends Activity {
                 String assetPath = "pml_demo";
                 String sdcardPath = Environment.getExternalStorageDirectory()
                         + File.separator + assetPath + File.separator + type;
-                PML.load(sdcardPath);
+                //PML.load(sdcardPath);
+                String modelPath = Environment.getExternalStorageDirectory()
+                        + File.separator + assetPath +
+                        File.separator + "googlenet_combine" + File.separator + "model";
+                String paramPath = Environment.getExternalStorageDirectory()
+                        + File.separator + assetPath +
+                        File.separator + "googlenet_combine" + File.separator + "params";
+                PML.loadCombined(modelPath, paramPath);
             }
         });
......
@@ -8,6 +8,14 @@ public class PML {
      */
     public static native boolean load(String modelPath);

+    /**
+     * Load a combined model, with the net structure and the params in two files.
+     * @param modelPath path of the model-structure file
+     * @param paramPath path of the params file
+     * @return true if the model was loaded successfully
+     */
+    public static native boolean loadCombined(String modelPath, String paramPath);
+
     /**
      * object detection
......
@@ -60,6 +60,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                          optimize);
 }

+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
+  ANDROIDLOGI("loadCombined invoked");
+  bool optimize = true;
+  return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
+                                         jstring2cppstring(env, paramPath),
+                                         optimize);
+}
+
 JNIEXPORT jfloatArray JNICALL
 Java_com_baidu_paddle_PML_predict(JNIEnv *env, jclass thiz, jfloatArray buf) {
   jfloatArray result = NULL;
......
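Editor's note: both entry points funnel the Java strings through jstring2cppstring before handing them to the native loader. That helper is not part of this diff; the sketch below shows the usual JNI UTF-8 round-trip such a helper performs. Its exact body here is an assumption, not the project's code.

#include <jni.h>
#include <string>

// Minimal jstring -> std::string conversion (sketch): copy the UTF-8 chars
// out of the JVM, then release the JNI-owned buffer.
std::string jstring2cppstring(JNIEnv *env, jstring jstr) {
  const char *chars = env->GetStringUTFChars(jstr, nullptr);
  if (chars == nullptr) return "";           // OOM: JNI may return NULL
  std::string cpp(chars);
  env->ReleaseStringUTFChars(jstr, chars);   // always release the JNI copy
  return cpp;
}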
@@ -22,11 +22,16 @@ extern "C" {
 namespace paddle_mobile {
 namespace jni {
 /**
- * load model & params of the net for android
+ * load separated model for android
  */
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
                                                           jclass thiz,
                                                           jstring modelPath);
+/**
+ * load combined model for android
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
+    JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);

 /**
  * object detection for android
......
This diff is collapsed.
@@ -19,12 +19,8 @@ limitations under the License. */
 #define B(i, j) B[(i)*ldb + (j)]
 #define C(i, j) C[(i)*ldc + (j)]

-// Block sizes for tiled computation; mc and kc are the m and k of one tile
-#define MC 128
-#define KC 128
-#define NC 1024
 #define MR 4
-#define NR 4
+#define NR 8

 #define s_min(i, j) ((i) < (j) ? (i) : (j))
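Editor's note: the removed MC/KC/NC macros fixed the cache-block sizes at compile time; after this change they are presumably chosen at run time, and the micro-tile grows from MR x NR = 4 x 4 to 4 x 8. For readers unfamiliar with the scheme, here is a minimal, unoptimized reference of the blocking structure the declarations below imply. The block sizes, packing layout, and names ending in "Ref" are illustrative assumptions, not the library's exact code.

#include <algorithm>
#include <vector>

namespace sketch {

// Illustrative cache-block sizes (the removed macros used 128/128/1024).
constexpr int MC = 128;
constexpr int KC = 128;
constexpr int NC = 1024;

// Stand-in for the packed inner kernel: the real InnerKernel walks the
// packed buffers in MR x NR = 4 x 8 tiles (AddDot4x8) with NEON.
void InnerKernelRef(int mc, int nc, int kc, const float *a, const float *b,
                    float *C, int ldc) {
  for (int i = 0; i < mc; ++i)
    for (int p = 0; p < kc; ++p)
      for (int j = 0; j < nc; ++j)
        C[i * ldc + j] += a[i * kc + p] * b[p * nc + j];
}

// C += A * B for row-major A (m x k), B (k x n); caller zero-fills C first.
// alpha/beta and the relu/batchnorm write-backs are left out for clarity.
void SgemmRef(int m, int n, int k, const float *A, int lda, const float *B,
              int ldb, float *C, int ldc) {
  for (int jc = 0; jc < n; jc += NC)
    for (int pc = 0; pc < k; pc += KC)
      for (int ic = 0; ic < m; ic += MC) {
        const int nc = std::min(NC, n - jc);
        const int kc = std::min(KC, k - pc);
        const int mc = std::min(MC, m - ic);
        // "Packing": copy the current blocks of A and B into contiguous
        // buffers, as PackMatrixA_/PackMatrixB_ do (theirs also reorder
        // the data for the micro-kernel).
        std::vector<float> a(mc * kc), b(kc * nc);
        for (int i = 0; i < mc; ++i)
          for (int p = 0; p < kc; ++p)
            a[i * kc + p] = A[(ic + i) * lda + (pc + p)];
        for (int p = 0; p < kc; ++p)
          for (int j = 0; j < nc; ++j)
            b[p * nc + j] = B[(pc + p) * ldb + (jc + j)];
        InnerKernelRef(mc, nc, kc, a.data(), b.data(), &C[ic * ldc + jc], ldc);
      }
}

}  // namespace sketch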
@@ -49,28 +45,66 @@ void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
                   float *buffer);

 // Blocked matrix multiplication
-void InnerKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                 const float *B, int ldb, float beta, float *C, int ldc,
-                 int first_time);
+void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
+                 float beta, float *c, float *C, int ldc, bool relu);
+
+void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
+                       const float *b, float beta, float *c, float *C, int ldc,
+                       bool relu, float *new_scale, float *new_bias);

 // Vector-matrix multiplication (M = 1)
 void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
-                  const float *B, int ldb, float beta, float *C, int ldc);
+                  const float *B, int ldb, float beta, float *C, int ldc,
+                  bool relu);
+
+void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu, float *new_scale, float *new_bias);

-// Compute a smaller 4 * 4 block of the C matrix
-void AddDot4x4(int k, float alpha, const float *A, int lda, const float *B,
-               int ldb, float beta, float *C, int ldc, int mc, int nc);
-
-void AddDot4x4_relu(int k, float alpha, const float *a, int lda, const float *b,
-                    int ldb, float beta, float *C, int ldc, int mc, int nc,
-                    bool relu);
+// Compute a smaller block of the C matrix
+void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
+void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
+
+// Write back the result of blocked matrix multiplication
+// C = A * B
+void WriteBasic(int mc, int nc, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C
+void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
+void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc);
+// C = A * B, batchnorm(C)
+void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
+                 float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
+                     float *new_scale, float *new_bias);
+
+// Write back the result of vector-matrix multiplication
+// C = A * B
+void VecWriteBasic(int n, float *c, float *C, int ldc);
+// C = alpha * A * B + beta * C
+void VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc);
+// C = A * B + C
+void VecWriteWithAdd(int n, float *c, float *C, int ldc);
+// C = A * B + C, relu(C)
+void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
+// C = A * B, batchnorm(C)
+void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
+                    float *new_bias);
+// C = A * B, batchnorm(C), relu(C)
+void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
+                        float *new_bias);

 // 32-bit float matrix multiplication
-void sgemm(int m, int n, int k, float alpha, const float *A, int lda,
-           const float *B, int ldb, float beta, float *C, int ldc);
+void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
+           const float *B, int ldb, float beta, float *C, int ldc, bool relu);

-void sgemm_relu(int m, int n, int k, float alpha, const float *A, int lda,
-                const float *B, int ldb, float beta, float *C, int ldc);
+// 32-bit float matrix multiplication, applying batchnorm to the result
+void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
+                 const float *B, int ldb, float beta, float *C, int ldc,
+                 bool relu, float *new_scale, float *new_bias);

 // 64-bit double matrix multiplication
 void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
......
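Editor's note: the Write* variants make the epilogue explicit: the packed block result c (mc x nc) is merged into C in a single pass, fusing relu and batchnorm instead of traversing C a second time. As a reference for what the bn variants compute, here is a naive sketch, assuming one scale/bias pair per output row (batchnorm is per output channel, and conv-as-GEMM puts channels on rows); the real NEON code may index or tile differently.

// Reference semantics for WriteWithBnRelu under the per-row assumption:
//   C(i, j) = max(new_scale[i] * c(i, j) + new_bias[i], 0)
void WriteWithBnReluRef(int mc, int nc, const float *c, float *C, int ldc,
                        const float *new_scale, const float *new_bias) {
  for (int i = 0; i < mc; ++i) {
    for (int j = 0; j < nc; ++j) {
      const float v = new_scale[i] * c[i * nc + j] + new_bias[i];
      C[i * ldc + j] = v > 0.f ? v : 0.f;  // WriteWithBn would skip the clamp
    }
  }
}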
@@ -39,22 +39,18 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
   int M = dim_out[0];
   int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+  int K = (!trans_a) ? dim_a[1] : dim_a[0];

-  if (relu) {
-    sgemm_relu(M, N, K, alpha, matrix_a.data<float>(), K,
-               matrix_b.data<float>(), N, beta, matrix_out->data<float>(), N);
-  } else {
-    sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
-          beta, matrix_out->data<float>(), N);
-  }
+  Sgemm(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
+        beta, matrix_out->data<float>(), N, relu);
 }

 template <>
-void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
-                    const framework::Tensor &matrix_b, bool trans_b,
-                    double alpha, framework::Tensor *matrix_out, double beta,
-                    bool relu) {
+void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
+                         const framework::Tensor &matrix_b, bool trans_b,
+                         float alpha, framework::Tensor *matrix_out, float beta,
+                         bool relu, framework::Tensor *new_scale,
+                         framework::Tensor *new_bias) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -71,7 +67,11 @@ void matmul<double>(const framework::Tensor &matrix_a, bool trans_a,
   int M = dim_out[0];
   int N = dim_out[1];
-  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+  int K = (!trans_a) ? dim_a[1] : dim_a[0];

+  SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
+              N, beta, matrix_out->data<float>(), N, relu,
+              new_scale->data<float>(), new_bias->data<float>());
 }

 }  // namespace math
......
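Editor's note: matmulWithBn takes already-folded new_scale/new_bias tensors rather than raw batchnorm parameters. The folding itself is not part of this diff; the standard derivation, which a fused conv+bn caller would presumably use to prepare these tensors, is bn(x) = gamma * (x - mean) / sqrt(var + eps) + beta = new_scale * x + new_bias:

#include <cmath>

// Fold batchnorm parameters into a single per-channel scale and bias, so the
// gemm epilogue can apply bn as one multiply-add:
//   new_scale[c] = gamma[c] / sqrt(var[c] + eps)
//   new_bias[c]  = beta[c] - mean[c] * new_scale[c]
void FoldBnParams(int channels, float eps, const float *gamma,
                  const float *beta, const float *mean, const float *var,
                  float *new_scale, float *new_bias) {
  for (int c = 0; c < channels; ++c) {
    const float inv_std = 1.f / std::sqrt(var[c] + eps);
    new_scale[c] = gamma[c] * inv_std;
    new_bias[c] = beta[c] - mean[c] * new_scale[c];
  }
}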
@@ -26,6 +26,12 @@ template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
             const framework::Tensor &matrix_b, bool trans_b, T alpha,
             framework::Tensor *matrix_out, T beta, bool relu = false);
+
+template <typename T>
+void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
+                  const framework::Tensor &matrix_b, bool trans_b, T alpha,
+                  framework::Tensor *matrix_out, T beta, bool relu,
+                  framework::Tensor *new_scale, framework::Tensor *new_bias);
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle_mobile