Commit e7b06b70 authored by qnqinan

Merge remote-tracking branch 'origin/develop' into develop

@@ -75,4 +75,5 @@ cmake-build-release
demo/ios/PaddleMobileDemo/PaddleMobileDemo/googlenet_combine/
demo/ios/PaddleMobileDemo/PaddleMobileDemo/*.jpg
demo/ios/PaddleMobileDemo/PaddleMobileDemo/PaddleMobile/*.a
*.xcuserstate
\ No newline at end of file
*.xcuserstate
/tools/quantification/quantify
@@ -36,7 +36,7 @@ static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, void *arg) {
return ioctl(req, (long unsigned int)arg);
return ioctl(req, (uint64_t)arg);
}
int open_device() {
@@ -58,9 +58,13 @@ void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(struct ConvArgs args) {}
int ComputeFpgaPool(struct PoolingArgs args) {}
int ComputeFpgaEWAdd(struct EWAddArgs args) {}
int ComputeFpgaConv(const struct ConvArgs &args) { return do_ioctl(21, &args); }
int ComputeFpgaPool(const struct PoolingArgs &args) {
return do_ioctl(22, &args);
}
int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
return do_ioctl(23, &args);
}
} // namespace fpga
} // namespace paddle_mobile
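The request codes 21–23 passed to do_ioctl here line up with the IOCTL_CONFIG_CONV / IOCTL_CONFIG_POOLING / IOCTL_CONFIG_EW numbers defined in the header below. A hypothetical call sequence, not part of this commit, with placeholder argument values and assuming a negative open_device() return signals failure:
paddle_mobile::fpga::ConvArgs args = {};  // caller fills image/filter/kernel fields
if (paddle_mobile::fpga::open_device() >= 0) {  // assumption: < 0 means failure
  int ret = paddle_mobile::fpga::ComputeFpgaConv(args);  // -> do_ioctl(21, &args)
}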
@@ -31,6 +31,18 @@ void* fpga_malloc(size_t size);
void fpga_free(void* ptr);
void fpga_copy(void* dst, const void* src, size_t num);
enum DataConvertType {
DATA_NO_CONVERT = 0,
DATA_FP32_TO_FP16 = 1,
DATA_FP16_TO_FP32 = 2,
};
enum LayoutConvertType {
LAYOUT_NO_CONVERT = 0,
LAYOUT_CHW_TO_HWC = 1,
LAYOUT_HWC_TO_CHW = 2,
};
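The layout enums describe the reshuffle between PaddlePaddle's channel-major (CHW) tensors and the HWC order the FPGA consumes. A minimal host-side sketch of LAYOUT_CHW_TO_HWC, assuming dense float data (the driver-side implementation is not part of this diff):
void chw_to_hwc(const float *src, float *dst, int c, int h, int w) {
  for (int ch = 0; ch < c; ++ch) {
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; ++j) {
        // src index (ch, i, j) in CHW maps to dst index (i, j, ch) in HWC
        dst[(i * w + j) * c + ch] = src[(ch * h + i) * w + j];
      }
    }
  }
}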
struct VersionArgs {
void* buffer;
};
@@ -79,7 +91,7 @@ struct ConvArgs {
uint32_t filter_num;
uint32_t group_num;
struct BNArgs bn;
void* sb_address; // scale and bias are interlaced;
struct KernelArgs kernel;
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
@@ -102,6 +114,12 @@ struct EWAddArgs {
struct ImageOutputArgs output;
};
struct BypassArgs {
enum DataConvertType convert_type;
struct ImageInputArgs image;
struct ImageOutputArgs output;
};
struct FpgaRegWriteArgs {
uint64_t address; //
uint64_t value;
@@ -115,8 +133,6 @@ struct FpgaRegReadArgs {
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 01, struct VersionArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 02, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 03, struct FpgaRegWriteArgs)
#define IOCTL_SEPARATOR_0 10
@@ -127,6 +143,8 @@ struct FpgaRegReadArgs {
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct ConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct PoolingArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct EWAddArgs)
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
enum FPGA_ERR_TYPE {
ERR_IOCTL_CMD = -1,
@@ -154,9 +172,9 @@ enum FPGA_ERR_TYPE {
//============================== API =============================
int ComputeFpgaConv(struct ConvArgs args);
int ComputeFpgaPool(struct PoolingArgs args);
int ComputeFpgaEWAdd(struct EWAddArgs args);
int ComputeFpgaConv(const struct ConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/types.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/scope.h"
#include "framework/tensor.h"
namespace paddle_mobile {
bool is_conv(std::string type) {
if (type.compare(G_OP_TYPE_CONV) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_ADD) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_RELU) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_BN_RELU) == 0) {
return true;
}
if (type.compare(G_OP_TYPE_FUSION_CONV_ADD_BN) == 0) {
return true;
}
return false;
}
template <typename Dtype>
void quantilize_op(std::shared_ptr<framework::OperatorBase<Dtype>> op,
std::shared_ptr<framework::Scope> scope) {
if (!is_conv(op.get()->Type())) {
return;
}
framework::Tensor* filter = nullptr;
auto var_vec = op.get()->Inputs().at("Filter");
if (!var_vec.empty()) {
auto var = scope.get()->FindVar(var_vec[0]);
filter = var->template GetMutable<framework::LoDTensor>();
}
float scale = 0;
// 32bit filter -> 8bit filter;
if (filter->type() == typeid(float)) {
framework::Tensor* originalFilter = filter;
framework::Tensor* quantFilter = new framework::Tensor();
float* floatData = originalFilter->data<float>();
int8_t* intData = quantFilter->mutable_data<int8_t>();
}
}
} // namespace paddle_mobile
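quantilize_op stops after allocating the int8 buffer, so the 32-bit to 8-bit conversion itself is still missing at this point in the commit. A sketch of that step under one common choice, symmetric max-abs scaling (assuming <algorithm> and <cmath>; the authors' intended scheme is not shown here):
// Map [-max|f|, +max|f|] onto [-127, 127]; return the scale for later dequantization.
float quantize_to_int8(const float *src, int8_t *dst, int len) {
  float max_abs = 0.f;
  for (int i = 0; i < len; ++i) max_abs = std::max(max_abs, std::fabs(src[i]));
  const float scale = max_abs > 0.f ? 127.f / max_abs : 1.f;
  for (int i = 0; i < len; ++i) {
    dst[i] = static_cast<int8_t>(std::round(src[i] * scale));
  }
  return scale;
}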
@@ -260,7 +260,7 @@ class Tensor {
inline float *scale_pointer() { return &scale; }
};
struct FPGAArgs fpga_args() const {
struct FPGAArgs &fpga_args() {
return fpgaArgs_;
}
#endif
......
@@ -32,6 +32,10 @@ limitations under the License. */
#include "common/threadpool.h"
#endif
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/fpga_quantilization.h"
#endif
namespace paddle_mobile {
using framework::Variable;
@@ -96,6 +100,11 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
for (const auto &op : ops) {
op->Init();
}
#ifdef PADDLE_MOBILE_FPGA
for (const auto &op : ops) {
quantilize_op(op, program_.scope);
}
#endif
}
template <typename Dtype, Precision P>
......
@@ -56,7 +56,8 @@ template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
auto program = this->LoadProgram(model_path, optimize, quantification);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
......
@@ -61,6 +61,15 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
optimize);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath) {
ANDROIDLOGI("loadQualified invoked");
bool optimize = true;
bool qualified = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
optimize, qualified);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombined invoked");
@@ -70,6 +79,16 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
optimize);
}
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath) {
ANDROIDLOGI("loadCombinedQualified invoked");
bool optimize = true;
bool qualified = true;
return getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
jstring2cppstring(env, paramPath),
optimize, qualified);
}
JNIEXPORT jfloatArray JNICALL Java_com_baidu_paddle_PML_predictImage(
JNIEnv *env, jclass thiz, jfloatArray buf, jintArray ddims) {
ANDROIDLOGI("predictImage invoked");
......
@@ -27,12 +27,24 @@ namespace jni {
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_load(JNIEnv *env,
jclass thiz,
jstring modelPath);
/**
* load separated qualified model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadQualified(
JNIEnv *env, jclass thiz, jstring modelPath);
/**
* load combined model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombined(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* load combined qualified model for android
*/
JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_PML_loadCombinedQualified(
JNIEnv *env, jclass thiz, jstring modelPath, jstring paramPath);
/**
* object detection for android
*/
......
@@ -107,20 +107,22 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
*buffer++ = *a3++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
if (m_tail != 0) {
if (m_tail <= 3) {
a3 = zero;
}
if (m_tail <= 2) {
a2 = zero;
}
if (m_tail <= 1) {
a1 = zero;
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
break;
default:
break;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
@@ -150,28 +152,89 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
*buffer++ = *a5++;
}
}
int i = m - m_tail;
a0 = &A(i, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
if (m_tail != 0) {
if (m_tail <= 5) {
a5 = zero;
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
case 4:
a4 = zero;
case 5:
a5 = zero;
break;
default:
break;
}
if (m_tail <= 4) {
a4 = zero;
}
if (m_tail <= 3) {
a3 = zero;
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
}
if (m_tail <= 2) {
a2 = zero;
}
}
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer) {
const float *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
for (int i = 0; i < m - m_tail; i += MR) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
a3 = A + (i + 3) * lda;
a4 = A + (i + 4) * lda;
a5 = A + (i + 5) * lda;
a6 = A + (i + 6) * lda;
a7 = A + (i + 7) * lda;
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
*buffer++ = *a1++;
*buffer++ = *a2++;
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
*buffer++ = *a6++;
*buffer++ = *a7++;
}
if (m_tail <= 1) {
a1 = zero;
}
if (m_tail != 0) {
a0 = &A(m - m_tail, 0);
a1 = a0 + lda;
a2 = a0 + 2 * lda;
a3 = a0 + 3 * lda;
a4 = a0 + 4 * lda;
a5 = a0 + 5 * lda;
a6 = a0 + 6 * lda;
a7 = a0 + 7 * lda;
switch (m_tail) {
case 1:
a1 = zero;
case 2:
a2 = zero;
case 3:
a3 = zero;
case 4:
a4 = zero;
case 5:
a5 = zero;
case 6:
a6 = zero;
case 7:
a7 = zero;
break;
default:
break;
}
for (int j = 0; j < k; ++j) {
*buffer++ = *a0++;
@@ -180,6 +243,8 @@ void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
*buffer++ = *a3++;
*buffer++ = *a4++;
*buffer++ = *a5++;
*buffer++ = *a6++;
*buffer++ = *a7++;
}
}
}
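Note that the switch blocks in these pack routines fall through on purpose: there is no break between the numbered cases, so m_tail == 1 cascades from case 1 through case 7 and zero-pads every trailing row pointer, reproducing the chain of if (m_tail <= N) tests in the code they replace.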
@@ -234,15 +299,78 @@ void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
}
}
#if __aarch64__
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
const float *b0;
for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, j);
asm volatile(
"prfm pldl2keep, [%[b0], #64] \n\t"
"ld1 {v0.4s, v1.4s, v2.4s}, [%[b0]] \n\t"
"st1 {v0.4s, v1.4s, v2.4s}, [%[buffer]], #48 \n\t"
: [buffer] "+r"(buffer)
: [b0] "r"(b0)
: "memory", "v0", "v1", "v2");
}
}
if (n_tail != 0) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *b0++;
}
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
}
}
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer) {
const float *b0;
for (int j = 0; j < n - n_tail; j += NR) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, j);
asm volatile(
"prfm pldl2keep, [%[b0], #64] \n\t"
"ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[b0]] \n\t"
"st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[buffer]], #64 \n\t"
: [buffer] "+r"(buffer)
: [b0] "r"(b0)
: "memory", "v0", "v1", "v2", "v3");
}
}
if (n_tail != 0) {
for (int i = 0; i < k; ++i) {
b0 = &B(i, n - n_tail);
for (int j = n - n_tail; j < n; ++j) {
*buffer++ = *b0++;
}
for (int j = n; j < n + (NR - n_tail); ++j) {
*buffer++ = 0;
}
}
}
}
#endif // __aarch64__
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
float beta, float *c, float *C, int ldc, bool relu) {
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
#if __aarch64__
// AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#else
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif
}
}
@@ -271,9 +399,14 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
#pragma omp parallel for
for (int j = 0; j < nc; j += NR) {
for (int i = 0; i < mc; i += MR) {
#if __aarch64__
// AddDot8x12(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x16(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#else
// AddDot4x4(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif
}
}
@@ -1956,10 +2089,20 @@ void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
#if __aarch64__
// PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
#if __aarch64__
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
// PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
InnerKernel(mc, nc, alpha, packedA, packedB, beta, packedC, &C(i, j), ldc,
relu);
}
@@ -2009,10 +2152,20 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
int mc, nc;
for (int j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
#if __aarch64__
// PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
for (int i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
#if __aarch64__
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
// PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
&C(i, j), ldc, relu, new_scale + i, new_bias + i);
}
@@ -2239,6 +2392,192 @@ void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
#endif // __ARM_NEON
}
#if __aarch64__
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k;
int step = 4 * ldc;
asm volatile(
"dup v5.4s, wzr \n\t"
"dup v6.4s, wzr \n\t"
"dup v7.4s, wzr \n\t"
"dup v8.4s, wzr \n\t"
"dup v9.4s, wzr \n\t"
"dup v10.4s, wzr \n\t"
"dup v11.4s, wzr \n\t"
"dup v12.4s, wzr \n\t"
"dup v13.4s, wzr \n\t"
"dup v14.4s, wzr \n\t"
"dup v15.4s, wzr \n\t"
"dup v16.4s, wzr \n\t"
"dup v17.4s, wzr \n\t"
"dup v18.4s, wzr \n\t"
"dup v19.4s, wzr \n\t"
"dup v20.4s, wzr \n\t"
"dup v21.4s, wzr \n\t"
"dup v22.4s, wzr \n\t"
"dup v23.4s, wzr \n\t"
"dup v24.4s, wzr \n\t"
"dup v25.4s, wzr \n\t"
"dup v26.4s, wzr \n\t"
"dup v27.4s, wzr \n\t"
"dup v28.4s, wzr \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"prfm pldl1keep, [%[a_ptr], #32] \n\t"
"prfm pldl1keep, [%[b_ptr], #48] \n\t"
"ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32 \n\t"
"ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48 \n\t"
"fmla v5.4s, v2.4s, v0.s[0] \n\t"
"fmla v6.4s, v3.4s, v0.s[0] \n\t"
"fmla v7.4s, v4.4s, v0.s[0] \n\t"
"fmla v8.4s, v2.4s, v0.s[1] \n\t"
"fmla v9.4s, v3.4s, v0.s[1] \n\t"
"fmla v10.4s, v4.4s, v0.s[1] \n\t"
"fmla v11.4s, v2.4s, v0.s[2] \n\t"
"fmla v12.4s, v3.4s, v0.s[2] \n\t"
"fmla v13.4s, v4.4s, v0.s[2] \n\t"
"fmla v14.4s, v2.4s, v0.s[3] \n\t"
"fmla v15.4s, v3.4s, v0.s[3] \n\t"
"fmla v16.4s, v4.4s, v0.s[3] \n\t"
"fmla v17.4s, v2.4s, v1.s[0] \n\t"
"fmla v18.4s, v3.4s, v1.s[0] \n\t"
"fmla v19.4s, v4.4s, v1.s[0] \n\t"
"fmla v20.4s, v2.4s, v1.s[1] \n\t"
"fmla v21.4s, v3.4s, v1.s[1] \n\t"
"fmla v22.4s, v4.4s, v1.s[1] \n\t"
"fmla v23.4s, v2.4s, v1.s[2] \n\t"
"fmla v24.4s, v3.4s, v1.s[2] \n\t"
"fmla v25.4s, v4.4s, v1.s[2] \n\t"
"fmla v26.4s, v2.4s, v1.s[3] \n\t"
"fmla v27.4s, v3.4s, v1.s[3] \n\t"
"fmla v28.4s, v4.4s, v1.s[3] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step] \n\t"
"st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step] \n\t"
"st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t"
"st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step] \n\t"
"st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step] \n\t"
"st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step] \n\t"
"st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t"
"st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[step] "r"(step)
: "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
}
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
const float *a_ptr, *b_ptr;
a_ptr = a;
b_ptr = b;
int kc1 = k;
int step = 4 * ldc;
int step1 = 4 * 6;
asm volatile(
"dup v6.4s, wzr \n\t"
"dup v7.4s, wzr \n\t"
"dup v8.4s, wzr \n\t"
"dup v9.4s, wzr \n\t"
"dup v10.4s, wzr \n\t"
"dup v11.4s, wzr \n\t"
"dup v12.4s, wzr \n\t"
"dup v13.4s, wzr \n\t"
"dup v14.4s, wzr \n\t"
"dup v15.4s, wzr \n\t"
"dup v16.4s, wzr \n\t"
"dup v17.4s, wzr \n\t"
"dup v18.4s, wzr \n\t"
"dup v19.4s, wzr \n\t"
"dup v20.4s, wzr \n\t"
"dup v21.4s, wzr \n\t"
"dup v22.4s, wzr \n\t"
"dup v23.4s, wzr \n\t"
"dup v24.4s, wzr \n\t"
"dup v25.4s, wzr \n\t"
"dup v26.4s, wzr \n\t"
"dup v27.4s, wzr \n\t"
"dup v28.4s, wzr \n\t"
"dup v29.4s, wzr \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"blt end_kc1_%= \n\t"
"loop_kc1_%=: \n\t"
"prfm pldl1keep, [%[a_ptr], #24] \n\t"
"prfm pldl1keep, [%[b_ptr], #64] \n\t"
"ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1] \n\t"
"ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64 \n\t"
"fmla v6.4s, v2.4s, v0.s[0] \n\t"
"fmla v7.4s, v3.4s, v0.s[0] \n\t"
"fmla v8.4s, v4.4s, v0.s[0] \n\t"
"fmla v9.4s, v5.4s, v0.s[0] \n\t"
"fmla v10.4s, v2.4s, v0.s[1] \n\t"
"fmla v11.4s, v3.4s, v0.s[1] \n\t"
"fmla v12.4s, v4.4s, v0.s[1] \n\t"
"fmla v13.4s, v5.4s, v0.s[1] \n\t"
"fmla v14.4s, v2.4s, v0.s[2] \n\t"
"fmla v15.4s, v3.4s, v0.s[2] \n\t"
"fmla v16.4s, v4.4s, v0.s[2] \n\t"
"fmla v17.4s, v5.4s, v0.s[2] \n\t"
"fmla v18.4s, v2.4s, v0.s[3] \n\t"
"fmla v19.4s, v3.4s, v0.s[3] \n\t"
"fmla v20.4s, v4.4s, v0.s[3] \n\t"
"fmla v21.4s, v5.4s, v0.s[3] \n\t"
"fmla v22.4s, v2.4s, v1.s[0] \n\t"
"fmla v23.4s, v3.4s, v1.s[0] \n\t"
"fmla v24.4s, v4.4s, v1.s[0] \n\t"
"fmla v25.4s, v5.4s, v1.s[0] \n\t"
"fmla v26.4s, v2.4s, v1.s[1] \n\t"
"fmla v27.4s, v3.4s, v1.s[1] \n\t"
"fmla v28.4s, v4.4s, v1.s[1] \n\t"
"fmla v29.4s, v5.4s, v1.s[1] \n\t"
"subs %[kc1], %[kc1], #1 \n\t"
"bge loop_kc1_%= \n\t"
"end_kc1_%=: \n\t"
"st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step] \n\t"
"st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step] \n\t"
"st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step] \n\t"
"st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step] \n\t"
"st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step] \n\t"
"st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step] \n\t"
:
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[step] "r"(step), [step1] "r"(step1)
: "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
}
#endif // __aarch64__
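For reference when reading the assembly: given the interleaving produced by PackMatrixA_6r and PackMatrixB_16c, AddDot6x16 computes one 6x16 tile of A*B and stores it to c, with accumulators starting from zero (the dup/wzr block). An equivalent scalar sketch, for checking rather than use:
void AddDot6x16_ref(int k, const float *a, const float *b, float *c, int ldc) {
  float acc[6][16] = {};  // mirrors the 24 zeroed v-registers v6..v29
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 6; ++i) {
      for (int j = 0; j < 16; ++j) {
        acc[i][j] += a[p * 6 + i] * b[p * 16 + j];  // packed-A row i, packed-B col j
      }
    }
  }
  for (int i = 0; i < 6; ++i) {  // write the finished 6x16 tile back to c
    for (int j = 0; j < 16; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}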
} // namespace math
} // namespace operators
} // namespace paddle_mobile
@@ -19,8 +19,13 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]
#if __aarch64__
#define MR 6
#define NR 16
#else
#define MR 6
#define NR 8
#endif
#define s_min(i, j) ((i) < (j) ? (i) : (j))
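The per-architecture split above is presumably a register-budget choice: a 6x16 accumulator tile is 96 floats, i.e. 24 of AArch64's 32 128-bit NEON registers (v6–v29 in AddDot6x16), while 32-bit ARM has only 16 quad registers, so it keeps the 6x8 tile with 12 accumulators.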
@@ -43,10 +48,16 @@ void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// Pack blocks of matrix B into contiguous memory (row-major)
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// Blocked matrix multiplication
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
@@ -70,6 +81,8 @@ void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
// Write back the result of the blocked matrix multiplication
// C = A * B
@@ -114,10 +127,6 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
// 64-bit double-precision matrix multiplication
void dgemm(int m, int n, int k, float alpha, const double *A, int lda,
const double *B, int ldb, float beta, double *C, int ldc);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
@@ -114,8 +114,12 @@ else ()
target_link_libraries(test-softmax paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm common/test_gemm.cpp)
target_link_libraries(test-gemm paddle-mobile)
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile)
# gen test
ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
#define m 1024
#define n 1024
#define k 1024
int main() {
Tensor aa, bb, cc, scale, bias;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n});
auto scaleptr = scale.mutable_data<float>({m});
auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2;
}
for (int i = 0; i < k * n; ++i) {
bbptr[i] = 2;
}
for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2;
}
for (int i = 0; i < m; ++i) {
scaleptr[i] = 1;
biasptr[i] = 0;
}
auto time1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(aa, false, bb, false,
static_cast<float>(1), &cc,
static_cast<float>(0), false);
// paddle_mobile::operators::math::matmulWithBn<float>(
// aa, false, bb, false, static_cast<float>(1), &cc,
// static_cast<float>(0), true, &scale, &bias, 0);
}
auto time2 = time();
std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
return 0;
}
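With m = n = k = 1024, each call performs 2·m·n·k ≈ 2.15 GFLOPs, so the averaged time converts directly to throughput. A hypothetical addition after the timing loop (time_diff returning milliseconds, as the print above assumes):
double avg_ms = time_diff(time1, time2) / 10;
double gflops = 2.0 * m * n * k / (avg_ms * 1e6);  // flops / (seconds * 1e9)
std::cout << "gemm perf :" << gflops << " GFLOPS\n";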
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
bool optimize = false;
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost: " << time_diff(time1, time2) << "ms";
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
}
return 0;
}
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
@@ -23,15 +23,20 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
auto time3 = time();
// warm up once
auto vec_result = paddle_mobile.Predict(input, dims);
auto time3 = time();
for (int i = 0; i < 10; ++i) {
auto vec_result = paddle_mobile.Predict(input, dims);
}
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
<< std::endl;
}
return 0;
}
@@ -68,60 +68,60 @@ std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
}
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char **dataP, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
uint32_t version = *reinterpret_cast<uint32_t *>(*dataP);
// write version
fwrite(&version, kSize32, 1, out_file);
dataP += kSize32;
*dataP += kSize32;
// 2 Lod information
auto *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, dataP, kSize64);
memcpy(lod_level_ptr, *dataP, kSize64);
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, kSize64, 1, out_file);
delete lod_level_ptr;
dataP += kSize64;
*dataP += kSize64;
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
uint64_t size = *reinterpret_cast<uint64_t *>(*dataP);
// write lod size
fwrite(&size, kSize64, 1, out_file);
(dataP) += kSize64;
(*dataP) += kSize64;
std::vector<size_t> tmp(size / sizeof(size_t));
for (unsigned long &k : tmp) {
k = *reinterpret_cast<size_t *>(dataP);
(dataP) += sizeof(size_t);
k = *reinterpret_cast<size_t *>(*dataP);
(*dataP) += sizeof(size_t);
}
// write lod size vector
fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*dataP);
// write tensor version
fwrite(&tensor_version, kSize32, 1, out_file);
(dataP) += kSize32;
(*dataP) += kSize32;
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(dataP);
int32_t size = *reinterpret_cast<int32_t *>(*dataP);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(dataP) += sizeof(int32_t);
(*dataP) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (dataP)[m];
buf.get()[m] = (*dataP)[m];
}
fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
(dataP) += (sizeof(char) * size);
(*dataP) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
@@ -158,9 +158,9 @@ void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP
memory = new char[tensorSize];
for (int n = 0; n < tensorSize; ++n) {
static_cast<char *>(memory)[n] = (dataP)[n];
static_cast<char *>(memory)[n] = (*dataP)[n];
}
dataP += tensorSize;
*dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
@@ -194,7 +194,7 @@ quantificate_combined(const std::string &model_path, const std::string &param_pa
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, data, out_file);
LoadWithDump(*var_desc, &data, out_file);
}
}
}
@@ -220,7 +220,7 @@ void quantificate_seperated(const std::string model_dir, const std::string param
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, data, out_file);
LoadWithDump(*var_desc, &data, out_file);
delete[] origin_data;
fclose(out_file);
}
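The elided hunk above scans each tensor for min_value and max_value; the byte-level write-out is not shown in this diff. A sketch of the affine uint8 encoding such a min/max scan typically feeds (memory_f, the element-count use of memory_size, and the 0–255 mapping are assumptions, not code from this tool):
const float *memory_f = reinterpret_cast<const float *>(memory);
float range = std::max(max_value - min_value, 1e-6f);  // guard a zero range
for (int g = 0; g < memory_size; ++g) {
  uint8_t q = static_cast<uint8_t>((memory_f[g] - min_value) / range * 255.0f);
  fwrite(&q, sizeof(uint8_t), 1, out_file);
}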
......