提交 7f7c824e 编写于 作者: E eclipsycn 提交者: GitHub

Merge branch 'develop' into develop

cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.6)
project(paddle-mobile)
option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
......@@ -15,7 +15,7 @@ file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
endif()
......@@ -43,7 +43,7 @@ if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(USE_OPENMP)
if(USE_OPENMP AND NOT IS_IOS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
......@@ -104,12 +104,21 @@ else()
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
add_definitions(-DARMV7)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
......@@ -130,8 +139,8 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
set(NET "defult" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
set(NET "default" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
......@@ -153,3 +162,4 @@ if(DEBUGING)
endif()
endif()
......@@ -27,10 +27,10 @@ Paddle-Moible是PaddlePaddle组织下的项目,是一个致力于嵌入式平
- **ARM CPU**
![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_29.png)
arm cpu是paddle-mobile的主要支持方向,cpu的通用性一直是其优势。嵌入式深度学习,需要大量的cpu汇编实现。我们正在紧锣密鼓的编码,为的是能充分硬件的每一点加速能力。
arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是120+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
arm cpu的优化工作还在进行中,现在使用了常规的cpu优化。在arm a73上paddle-mobile arm-v7现在单核运行一次mobilenet1.0是110+ms,显然这不是我们的最终目标,我们正在用大量的汇编改写,后续性能仍会有巨大提升空间, 目前只支持armv7, 未来我们也会支持armv8。
- **Mali GPU**
......
# Quantification 模型量化、反量化
## 背景故事
部分网络如AlexNet训练出的模型体积较大,不适宜在移动设备上使用。
## 解决模型过大办法
1. 选用适合移动端的模型结构如:mobilenet、googlenet、 yolo、squeezenet 等;
2. 使用我们提供的量化工具,可以在几乎不影响精度的情况下将float32模型减小至原模型的 1/4;
- - - - -
## 量化工具介绍
### 模型转化工具目录:
- [量化工具目录](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/tools/quantification)
- [模型转化工具](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/convert.cpp)
#### 使用说明
- [工具使用](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/tools/quantification/README.md)
## 如何读取量化后的模型
load方法中添加了 quantification 参数,默认为false。 如果需要load量化后的模型,按需传参即可。
[我是源代码](https://github.com/PaddlePaddle/paddle-mobile/blob/55302b33ea3bd68c9797d8f65e527544792b8095/src/io/paddle_mobile.h)
```c++
bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1);
```
- - - - -
......@@ -17,38 +17,46 @@ limitations under the License. */
namespace paddle_mobile {
const std::string G_OP_TYPE_CONV = "conv2d";
const std::string G_OP_TYPE_BATCHNORM = "batch_norm";
const std::string G_OP_TYPE_BOX_CODER = "box_coder";
const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const std::string G_OP_TYPE_FC = "fusion_fc";
const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
const std::string G_OP_TYPE_MUL = "mul";
const std::string G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const std::string G_OP_TYPE_POOL2D = "pool2d";
const std::string G_OP_TYPE_PRIOR_BOX = "prior_box";
const std::string G_OP_TYPE_RELU = "relu";
const std::string G_OP_TYPE_RESHAPE = "reshape";
const std::string G_OP_TYPE_SIGMOID = "sigmoid";
const std::string G_OP_TYPE_SOFTMAX = "softmax";
const std::string G_OP_TYPE_TRANSPOSE = "transpose";
const std::string G_OP_TYPE_SPLIT = "split";
const std::string G_OP_TYPE_FEED = "feed";
const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const std::string G_OP_TYPE_DROPOUT = "dropout";
const char *G_OP_TYPE_CONV = "conv2d";
const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu";
const char *G_OP_TYPE_RESHAPE = "reshape";
const char *G_OP_TYPE_SIGMOID = "sigmoid";
const char *G_OP_TYPE_SOFTMAX = "softmax";
const char *G_OP_TYPE_TRANSPOSE = "transpose";
const char *G_OP_TYPE_SPLIT = "split";
const char *G_OP_TYPE_FEED = "feed";
const char *G_OP_TYPE_FETCH = "fetch";
const char *G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const char *G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const char *G_OP_TYPE_DROPOUT = "dropout";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN = "fusion_conv_add_bn";
const char *G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
"fusion_elementwise_add_relu";
const char *G_OP_TYPE_FUSION_FC_RELU = "fusion_fc_relu";
const char *G_OP_TYPE_REGION = "region";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
......@@ -72,6 +80,11 @@ std::unordered_map<
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_BN, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_FUSION_FC_RELU, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle_mobile {
......@@ -72,33 +73,40 @@ enum PMStatus {
PMWrongDevice = 0x08 /*!< un-correct device. */
};
extern const std::string G_OP_TYPE_CONV;
extern const std::string G_OP_TYPE_BATCHNORM;
extern const std::string G_OP_TYPE_BOX_CODER;
extern const std::string G_OP_TYPE_CONCAT;
extern const std::string G_OP_TYPE_ELEMENTWISE_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
extern const std::string G_OP_TYPE_MULTICLASS_NMS;
extern const std::string G_OP_TYPE_POOL2D;
extern const std::string G_OP_TYPE_PRIOR_BOX;
extern const std::string G_OP_TYPE_RELU;
extern const std::string G_OP_TYPE_RESHAPE;
extern const std::string G_OP_TYPE_SIGMOID;
extern const std::string G_OP_TYPE_SOFTMAX;
extern const std::string G_OP_TYPE_TRANSPOSE;
extern const std::string G_OP_TYPE_SPLIT;
extern const std::string G_OP_TYPE_FEED;
extern const std::string G_OP_TYPE_FETCH;
extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const std::string G_OP_TYPE_IM2SEQUENCE;
extern const std::string G_OP_TYPE_DROPOUT;
extern const char *G_OP_TYPE_CONV;
extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_RELU;
extern const char *G_OP_TYPE_LRN;
extern const char *G_OP_TYPE_MUL;
extern const char *G_OP_TYPE_MULTICLASS_NMS;
extern const char *G_OP_TYPE_POOL2D;
extern const char *G_OP_TYPE_PRIOR_BOX;
extern const char *G_OP_TYPE_RELU;
extern const char *G_OP_TYPE_RESHAPE;
extern const char *G_OP_TYPE_SIGMOID;
extern const char *G_OP_TYPE_SOFTMAX;
extern const char *G_OP_TYPE_TRANSPOSE;
extern const char *G_OP_TYPE_SPLIT;
extern const char *G_OP_TYPE_FEED;
extern const char *G_OP_TYPE_FETCH;
extern const char *G_OP_TYPE_DEPTHWISE_CONV;
extern const char *G_OP_TYPE_IM2SEQUENCE;
extern const char *G_OP_TYPE_DROPOUT;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_POOL_BN;
extern const char *G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_FC_RELU;
extern const char *G_OP_TYPE_REGION;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
......@@ -84,7 +84,7 @@ struct Variant {
if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(&data));
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
exit(0);
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include "fpga/api/fpga_api.h"
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, void *arg) { return ioctl(req, arg); }
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
// memory management;
void *fpga_malloc(size_t size) {
return reinterpret_cast<(void *)> mmap64(NULL, size, PROT_READ | PROT_WRITE,
MAP_SHARED, fd, 0);
}
void fpga_free(void *ptr) { munmap(ptr, 0); }
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
int ComputeFpgaConv(struct FpgaConvArgs) {}
int ComputeFpgaPool(struct FpgaPoolArgs) {}
int ComputeFpgaEWAdd(struct FpgaEWAddArgs) {}
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <iostream>
#include <limits>
// memory management;
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
int open_device();
int close_device();
void *fpga_malloc(size_t size);
void fpga_free(void *ptr);
void fpga_copy(void *dst, const void *src, size_t num);
struct FpgaVersionArgs {
void *buf;
};
struct MemoryToPhysicalArgs {
const void *src;
uint64_t physical;
};
struct MemoryCopyArgs {
void *src;
void *dst;
size_t size;
};
struct FpgaQuantArgs {
float scale;
};
struct FpgaBNArgs {};
struct FpgaConvArgs {
bool enable_BN = false;
bool enable_Relu = false;
struct FpgaBNParam bn_parm;
};
struct FpgaPoolArgs {
bool enable_BN = false;
struct FpgaBNParam bn_parm;
};
struct FpgaEWAddArgs { // only support X + Y
bool enable_Relu = false;
};
int ComputeFpgaConv(struct FpgaConvArgs);
int ComputeFpgaPool(struct FpgaPoolArgs);
int ComputeFpgaEWAdd(struct FpgaEWAddArgs);
#define IOCTL_FPGA_MAGIC 'FPGA'
#define IOCTL_VERSION _IOW(IOCTL_FPGA_MAGIC, 1, struct FpgaVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_FPGA_MAGIC, 2, struct FpgaQuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_FPGA_MAGIC, 3, struct FpgaArgs)
#define IOCTL_MEM_COPY _IOW(IOCTL_FPGA_MAGIC, 11, struct MemoryCopyArgs)
#define IOCTL_MEM_TOPHY _IOW(IOCTL_FPGA_MAGIC, 12, struct MemoryToPhysicalArgs)
#define IOCTL_CONFIG_CONV _IOW(IOCTL_FPGA_MAGIC, 21, struct FpgaConvArgs)
#define IOCTL_CONFIG_POOLING _IOW(IOCTL_FPGA_MAGIC, 22, struct FpgaPoolArgs)
#define IOCTL_CONFIG_EW _IOW(IOCTL_FPGA_MAGIC, 23, struct FpgaEWAddArgs)
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
......@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
return it->second.second;
}
template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs";
return {};
}
return it->second.first;
}
template <typename Dtype>
OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
......@@ -49,6 +59,11 @@ template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
vector<string> input_keys = GetInputKeys();
for (const auto key : input_keys) {
Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
DLOG << type_ << " input- " << key << "=" << *input;
}
vector<string> output_keys = GetOutKeys();
for (const auto key : output_keys) {
Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
......
......@@ -61,6 +61,7 @@ class OperatorBase {
virtual ~OperatorBase() {}
void Run() const;
std::vector<string> GetOutKeys() const;
std::vector<string> GetInputKeys() const;
virtual void RunImpl() const = 0;
virtual void Init() = 0;
......@@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
virtual void InferShape() const = 0;
void Init() {
// for (auto i : this->inputs_) {
// DLOG << i.first;
// DLOG << i.second;
// }
PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
this->type_.c_str());
}
......@@ -146,7 +151,7 @@ class OpKernelBase {
}
#endif
virtual void Compute(const P &para) const = 0;
virtual bool Init(P *para) { return true; };
virtual bool Init(P *para) { return true; }
virtual ~OpKernelBase() = default;
private:
......
......@@ -42,8 +42,17 @@ class FusionOpRegister {
matchers_[matcher->Type()] = shared_matcher;
}
const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
return matchers_;
const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
for (const auto& match : matchers_) {
matchers.push_back(match.second);
}
std::sort(matchers.begin(), matchers.end(),
[](std::shared_ptr<FusionOpMatcher> first,
std::shared_ptr<FusionOpMatcher> second) {
return first->BeginNode().Depth() > second->BeginNode().Depth();
});
return matchers;
}
private:
......
......@@ -44,23 +44,6 @@ bool Node::operator==(const Node &in) {
return true;
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(size - 1, &op_descs);
return op_descs;
}
void Node::OpDescs(int index,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
if (index == 0) {
return;
}
op_desc->push_back(this->op_desc_);
for (auto &output : outputs_) {
output->OpDescs(index, op_desc);
}
}
std::shared_ptr<Node> Node::To(int size) {
std::shared_ptr<Node> node = std::make_shared<Node>();
this->To(size - 1, node);
......
......@@ -47,13 +47,10 @@ class Node {
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
change,
std::vector<std::shared_ptr<Node>> *removed_nodes);
std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
std::string Type() { return type_; }
private:
void OpDescs(int size,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
void To(int index, std::shared_ptr<Node>);
void Folder(
std::shared_ptr<framework::OpDesc> op_desc,
......
......@@ -78,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
}
for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
std::string fusion_type = registed.first;
std::shared_ptr<FusionOpMatcher> matcher = registed.second;
// DLOG << " registed node \n " << matcher->BeginNode();
std::string fusion_type = registed->Type();
std::shared_ptr<FusionOpMatcher> matcher = registed;
auto match_vector = type_map[matcher->BeginType()];
......
......@@ -30,6 +30,7 @@ class Program {
std::string model_path;
std::string para_path;
bool combined = false;
bool quantification = false;
private:
};
......
......@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
......@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = (uint8_t *)(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
}
(*data) += (sizeof(char) * memory_size * type_size);
}
template <typename Dtype, Precision P>
......
......@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
const std::string &dirname, bool optimize, bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
program.model_path = dirname;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path,
bool optimize) {
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
......
......@@ -30,6 +30,7 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
......@@ -38,11 +39,13 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
bool optimize = false,
bool quantification = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
};
......
......@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(dirname, optimize), batch_size, optimize);
loader_->Load(dirname, optimize, quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize), batch_size, optimize);
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......
......@@ -39,14 +39,18 @@ class PaddleMobile {
* @b 加载分开形式的 fluid 模型
* */
bool Load(const std::string &dirname, bool optimize = false,
int batch_size = 1);
bool quantification = false, int batch_size = 1);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, int batch_size = 1);
bool optimize = false, bool quantification = false,
int batch_size = 1);
/*
* @b 设置线程数, 当 cmake 中开启 openmp 时生效
* */
void SetThreadNum(int num);
/*
......
......@@ -16,10 +16,32 @@ limitations under the License. */
#include <cstdlib>
#include <cstring>
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/api/fpga_api.h"
#endif
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
#ifdef PADDLE_MOBILE_FPGA
namespace api = paddle::mobile::fpga::api;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) { return api::malloc(size); }
void Free(void *ptr) {
if (ptr) {
api::fpga_free(ptr);
}
}
#else
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
......@@ -42,5 +64,7 @@ void Free(void *ptr) {
}
}
#endif
} // namespace memory
} // namespace paddle_mobile
......@@ -26,7 +26,7 @@ void BatchNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.OutputY()->Resize(x_dims);
}
template class BatchNormOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,7 +47,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
this->param_.OutputBox()->Resize(framework::make_ddim(
{input_targetbox_dims[0], input_priorbox_dims[0], 4}));
}
template class BoxCoderOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -56,7 +56,6 @@ void ConcatOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(out_dims);
}
template class ConcatOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -48,8 +48,6 @@ void ConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class ConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -49,8 +49,6 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ void DropoutOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class DropoutOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
template class ElementwiseAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -14,10 +14,7 @@ limitations under the License. */
#include "feed_op.h"
namespace paddle_mobile {
namespace operators {
template class FeedOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -14,10 +14,7 @@ limitations under the License. */
#include "fetch_op.h"
namespace paddle_mobile {
namespace operators {
template class FetchOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -45,7 +45,6 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
}
......@@ -68,11 +66,11 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
#ifdef PADDLE_MOBILE_CPU
//#ifndef CONV_ADD_REGISTER
// static framework::FusionOpRegistrar convadd_registrar(
// new FusionConvAddMatcher());
//#define CONV_ADD_REGISTER
//#endif
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
......
......@@ -44,7 +44,7 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddBNReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#ifdef FUSION_CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h"
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONVADDRELU_OP
#ifdef FUSION_CONVADDRELU_OP
#pragma once
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionConvBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>> {
public:
FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -44,7 +44,7 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionDWConvBNReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -38,8 +38,6 @@ class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
......
......@@ -50,7 +50,6 @@ void FusionFcOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(ddim);
}
template class FusionFcOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,8 +47,6 @@ void Im2SequenceOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class Im2SequenceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#ifdef FUSION_CONVADDRELU_OP
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_relu_arm_func.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
// DLOG << "variance: " << *variance;
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
template <>
void ConvBNReluKernel<CPU, float>::Compute(
const FusionConvBNReluParam &param) const {
ConvBNReluCompute<float>(param);
}
template class ConvBNReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -54,7 +54,40 @@ void BatchnormCompute(const BatchNormParam &param) {
int HXW = H * W;
#ifdef ARMV7
#if __ARM_NEON
#if __aarch64__
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
#else
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
......@@ -229,6 +262,7 @@ void BatchnormCompute(const BatchNormParam &param) {
delete[] inv_std_ptr;
}
#endif
#else
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_RELU_OP
#ifdef FUSION_CONVADDRELU_OP
#pragma once
#include <vector>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNReluBasic(const FusionConvBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void ConvBNReluCompute(const FusionConvBNReluParam &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
ConvBNReluBasic(param);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -76,15 +76,20 @@ void PoolCompute(const PoolParam &param) {
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
#ifndef IOS
#if __ARM_NEON
#if __aarch64__
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#else
if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#endif
#endif // __ARM_NEON
} else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
}
......
......@@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) {
input_outer_ptr++;
}
}
#else
#endif
}
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
#ifdef FUSION_CONVADD_RELU_OP
#ifdef FUSION_CONVADDRELU_OP
#include <vector>
#include "framework/ddim.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVBNRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvBNReluKernel
: public OpKernelBase<DeviceType, FusionConvBNReluParam> {
public:
void Compute(const FusionConvBNReluParam &param) const;
bool Init(FusionConvBNReluParam *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -24,7 +24,7 @@ void LrnOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims);
}
template class LrnOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwise_conv_3x3.h"
#ifdef __ARM_NEON
#if __ARM_NEON
#include <arm_neon.h>
#endif
#include <vector>
......@@ -23,7 +23,6 @@ namespace math {
void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
vector<int> paddings, const Tensor *filter, Tensor *bias,
Tensor *output, bool if_bias) {
#ifdef __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
}
} else {
#if defined(ARMV17)
#if __ARM_NEON
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
......@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
[filter2] "r"(filter2), [filter3] "r"(filter3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#endif
#endif // __ARM_NEON
}
}
}
......@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const int batch_size = input->dims()[0];
......@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor bias, bool if_bias) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......
此差异已折叠。
......@@ -19,7 +19,7 @@ limitations under the License. */
#define B(i, j) B[(i)*ldb + (j)]
#define C(i, j) C[(i)*ldc + (j)]
#define MR 4
#define MR 6
#define NR 8
#define s_min(i, j) ((i) < (j) ? (i) : (j))
......@@ -28,6 +28,7 @@ namespace paddle_mobile {
namespace operators {
namespace math {
/*
// 将 A 矩阵分块复制到连续内存(ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
......@@ -35,14 +36,17 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// 将 B 矩阵分块复制到连续内存(ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
*/
// 将 A 矩阵分块复制到连续内存(RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
// 将 B 矩阵分块复制到连续内存(RowMajor)
void PackMatrixB_(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
// 分块矩阵乘法
void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
......@@ -51,7 +55,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
/*
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
......@@ -60,10 +64,12 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// 计算一个更小的 C 矩阵分块
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
// 分块矩阵乘法结果回写
// C = A * B
......@@ -81,6 +87,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
/*
// 向量矩阵乘法结果回写
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
......@@ -96,6 +103,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32位 float 矩阵乘法
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "operators/math/im2col.h"
#include <vector>
#ifdef __ARM_NEON
#include "arm_neon.h"
#include <arm_neon.h>
#endif
#include "common/types.h"
namespace paddle_mobile {
......@@ -69,7 +69,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>();
T *col_data = col->data<T>();
#ifdef __ARM_NEON
#if __ARM_NEON
const int osize = col_height;
const int isize = im_height;
bool pad1 = padding[0] > 0;
......
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_2x2.h"
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
......@@ -21,10 +23,10 @@ namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#ifdef __ARM_NEON
#ifdef ARMV7
#if __ARM_NEON
#if __aarch64__
#else
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -93,15 +95,16 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
output_data += output_batch_stride;
}
#endif
#else
#endif
}
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#ifdef __ARM_NEON
#if __ARM_NEON
#ifdef ARMV7
#if __aarch64__
#else
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -171,12 +174,9 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
// TODO(): to imp other asm
#endif
#else
#endif
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "pool_3x3.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
......@@ -518,6 +518,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
input_data += input_batch_stride;
out_data += output_batch_stride;
}
#else
#endif
}
......@@ -582,7 +584,18 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if defined(ARMV7)
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
......@@ -598,17 +611,6 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif
}
}
......@@ -676,8 +678,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
}
output_seg[ph * output_width + pw] = sum / 9.0;
} else {
#if defined(ARMV7)
#if __aarch64__
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
......@@ -696,7 +698,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
[output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr)
: "memory", "r6", "q1", "q2", "q3", "q4");
#else
#endif
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
......@@ -707,7 +709,6 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
vget_low_f32(sum_data));
res = vpadd_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
}
}
}
......@@ -715,6 +716,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
#endif
}
} // namespace math
......
......@@ -135,6 +135,7 @@ class SoftmaxFuntor<CPU, T> {
}
}
}
#else
#endif // ARM_NEON
public:
......
......@@ -50,7 +50,7 @@ void MulOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
template class MulOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -34,7 +34,7 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
// pre size, will change in Compute.
this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
}
template class MultiClassNMSOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -232,7 +232,6 @@ class ConvParam : OpParam {
Print &operator<<(Print &printer, const ConvParam &conv_param);
#endif
#ifdef ELEMENTWISEADD_OP
class ElementwiseAddParam : OpParam {
public:
ElementwiseAddParam(const VariableNameMap &inputs,
......@@ -259,6 +258,8 @@ class ElementwiseAddParam : OpParam {
int axis_;
};
#ifdef FUSION_ELEMENTWISEADDRELU_OP
using ElementwiseAddReluParam = ElementwiseAddParam;
#endif
#ifdef MUL_OP
......@@ -371,7 +372,7 @@ class BatchNormParam : OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *InputX() const { return input_x_; }
......@@ -421,7 +422,7 @@ class PoolParam : public OpParam {
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
gloabal_pooling_ = GetAttr<bool>("global_pooling", attrs);
global_pooling_ = GetAttr<bool>("global_pooling", attrs);
}
const Tensor *Input() const { return input_; }
......@@ -438,7 +439,7 @@ class PoolParam : public OpParam {
bool isCeilMode() const { return ceil_mode_; }
bool isGlobalPooling() const { return gloabal_pooling_; }
bool isGlobalPooling() const { return global_pooling_; }
private:
Tensor *input_;
......@@ -448,9 +449,82 @@ class PoolParam : public OpParam {
vector<int> strides_;
vector<int> paddings_;
bool ceil_mode_;
bool gloabal_pooling_ = false;
bool global_pooling_ = false;
};
#endif
#ifdef FUSION_POOLBN_OP
class FusionPoolBNParam : OpParam {
public:
FusionPoolBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
const Scope &scope) {
input_ = InputXFrom<LoDTensor>(inputs, scope);
pooling_type_ = GetAttr<string>("pooling_type", attrs);
ksize_ = GetAttr<vector<int>>("ksize", attrs);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
ceil_mode_ = GetAttr<bool>("ceil_mode", attrs);
global_pooling_ = GetAttr<bool>("global_pooling", attrs);
output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const string &PoolingType() const { return pooling_type_; }
const vector<int> &Ksize() const { return ksize_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
bool isCeilMode() const { return ceil_mode_; }
bool isGlobalPooling() const { return global_pooling_; }
Tensor *OutputY() const { return output_y_; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
const string &DataFormat() const { return data_format_; }
private:
Tensor *input_;
string pooling_type_;
vector<int> ksize_;
vector<int> strides_;
vector<int> paddings_;
bool ceil_mode_;
bool global_pooling_ = false;
Tensor *output_y_;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
string data_format_;
};
#endif
#ifdef PRIORBOX_OP
......@@ -875,7 +949,6 @@ class PReluParam : public OpParam {
};
#endif
#ifdef FUSION_FC_OP
class FusionFcParam : public OpParam {
public:
FusionFcParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
......@@ -911,9 +984,11 @@ class FusionFcParam : public OpParam {
int y_num_col_dims_;
int axis_;
};
#ifdef FUSION_FCRELU_OP
using FusionFcReluParam = FusionFcParam;
#endif
#ifdef FUSION_CONVADD_OP
class FusionConvAddParam : public OpParam {
public:
FusionConvAddParam(const VariableNameMap &inputs,
......@@ -960,9 +1035,8 @@ class FusionConvAddParam : public OpParam {
};
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVADD_RELU_OP
#ifdef FUSION_CONVADDRELU_OP
class FusionConvAddReluParam : public FusionConvAddParam {
public:
FusionConvAddReluParam(const VariableNameMap &inputs,
......@@ -993,7 +1067,7 @@ class FusionConvAddBNReluParam : public OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
Tensor *Bias() const { return bias_; }
......@@ -1055,8 +1129,91 @@ class FusionConvAddBNReluParam : public OpParam {
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#ifdef FUSION_CONVADDBN_OP
class FusionConvAddBNParam : public OpParam {
public:
FusionConvAddBNParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
bias_ = InputYFrom<LoDTensor>(inputs, scope);
axis_ = GetAttr<int>("axis", attrs);
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_y_ = OutputYFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
Tensor *Bias() const { return bias_; }
const int &Axis() const { return axis_; }
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *OutputY() const { return output_y_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *bias_;
int axis_;
Tensor *input_;
Tensor *output_y_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif
#ifdef FUSION_DWCONVBNRELU_OP
......@@ -1078,7 +1235,7 @@ class FusionDWConvBNReluParam : public OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
......@@ -1139,6 +1296,85 @@ class FusionDWConvBNReluParam : public OpParam {
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVBNRELU_OP
class FusionConvBNReluParam : public OpParam {
public:
FusionConvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_ = OutFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *input_;
Tensor *output_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif
#ifdef IM2SEQUENCE_OP
class Im2SequenceParam : public OpParam {
public:
......@@ -1190,5 +1426,9 @@ class DropoutParam : public OpParam {
};
#endif
#ifdef REGION_OP
class RegionParam : public OpParam {};
#endif
} // namespace operators
} // namespace paddle_mobile
......@@ -54,7 +54,7 @@ void PoolOp<DeviceType, T>::InferShape() const {
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
template class PoolOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ void PReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class PReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -44,7 +44,7 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
}
template class PriorBoxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ void ReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,7 +27,7 @@ void ReshapeOp<Dtype, T>::InferShape() const {
auto out_dims = ValidateShape(shape, input_x_dims);
this->param_.Out()->Resize(out_dims);
}
template class ReshapeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ResizeOp<Dtype, T>::InferShape() const {
auto out_dims = CalOutputShape(this->param_);
this->param_.Out()->Resize(out_dims);
}
template class ResizeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ScaleOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ScaleOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SigmoidOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ template <typename Dtype, typename T>
void SliceOp<Dtype, T>::InferShape() const {
/// todo: add InputShape() detection.
}
template class SliceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SoftmaxOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SoftmaxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,7 +47,7 @@ void TransposeOp<Dtype, T>::InferShape() const {
}
this->param_.Out()->Resize(out_dims);
}
template class TransposeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
if (NET STREQUAL "googlenet")
if ("googlenet" IN_LIST NET)
# gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile)
elseif (NET STREQUAL "mobilenet")
elseif ("mobilenet" IN_LIST NET)
# gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile)
elseif (NET STREQUAL "yolo")
elseif ("yolo" IN_LIST NET)
# gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile)
elseif (NET STREQUAL "squeezenet")
elseif ("squeezenet" IN_LIST NET)
# gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile)
elseif(NET STREQUAL "resnet")
elseif("resnet" IN_LIST NET)
# gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile)
......@@ -145,6 +145,10 @@ else ()
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif()
......@@ -43,7 +43,7 @@ template <typename DeviceType, typename OpType>
class Executor4Test : public Executor<DeviceType> {
public:
Executor4Test(Program<DeviceType> p, string op_type,
bool use_optimize = false)
bool use_optimize = false, int predict_op_count = 1)
: Executor<DeviceType>() {
this->use_optimize_ = use_optimize;
this->program_ = p;
......@@ -57,12 +57,14 @@ class Executor4Test : public Executor<DeviceType> {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "to_predict_program_ == nullptr";
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
this->to_predict_program_->Blocks();
for (std::shared_ptr<BlockDesc> block_desc : blocks) {
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
for (std::shared_ptr<OpDesc> op : ops) {
if (op->Type() == op_type) {
for (int i = 0; i < ops.size(); ++i) {
auto op = ops[i];
if (op->Type() == op_type && i < predict_op_count) {
DLOG << "匹配到: " << op->Type();
/// test first meeting op in program
......@@ -72,11 +74,17 @@ class Executor4Test : public Executor<DeviceType> {
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), this->program_.scope);
this->ops_of_block_[*block_desc.get()].push_back(op_ptr);
break;
}
}
}
this->InitMemory();
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
auto &ops = this->ops_of_block_[*to_predict_block.get()];
for (const auto &op : ops) {
op->Init();
}
}
template <typename T = LoDTensor>
......@@ -130,9 +138,6 @@ class Executor4Test : public Executor<DeviceType> {
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>(dDim);
std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
std::shared_ptr<paddle_mobile::framework::BlockDesc> to_predict_block =
this->to_predict_program_->Block(0);
for (int j = 0; j < this->ops_of_block_[*to_predict_block.get()].size();
......@@ -141,6 +146,7 @@ class Executor4Test : public Executor<DeviceType> {
op->Run();
}
return out_tensor;
return std::make_shared<paddle_mobile::framework::Tensor>(
paddle_mobile::framework::Tensor(*output_tensor));
}
};
......@@ -19,7 +19,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(g_googlenet, true);
// auto program = loader.Load(g_googlenet, true);
auto program = loader.Load(g_mobilenet_ssd, true);
// auto program = loader.Load(g_googlenet_combine + "/model",
// g_googlenet_combine +
// "/params", true);
......
......@@ -23,7 +23,7 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
......
......@@ -12,28 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet_ssd, true)) {
auto isok = paddle_mobile.Load(
std::string(g_mobilenet_ssd_gesture) + "/model",
std::string(g_mobilenet_ssd_gesture) + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 300, 300}, static_cast<float>(0),
static_cast<float>(1));
GetInput<float>(g_hand, &input, dims);
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
paddle_mobile.Predict(input, dims);
auto output = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
return 0;
}
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
......@@ -22,20 +22,23 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet, true)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
static_cast<float>(1));
std::vector<float> input(input_tensor.data<float>(),
input_tensor.data<float>() + input_tensor.numel());
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
for (int i = 0; i < 10; ++i) {
auto time3 = time();
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
std::vector<float>::iterator biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
}
return 0;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_include.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../models/image_classification_resnet.inference.model
auto program = loader.Load(g_mobilenet, true);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
"program file read fail");
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::FusionConvAddBNReluOp<
paddle_mobile::CPU, float>>
executor(program, "fusion_conv_add_bn_relu", true);
std::cout << "executor 4 test: " << std::endl;
paddle_mobile::framework::Tensor input;
GetInput<float>(g_test_image_1x3x224x224_banana, &input, {1, 3, 224, 224});
// // use SetupTensor if not has local input image .
// SetupTensor<float>(&input, {1, 3, 224, 224}, static_cast<float>(0),
// static_cast<float>(1));
DLOG << " fuck: " << input;
auto out_ddim = paddle_mobile::framework::make_ddim({1, 32, 112, 112});
std::cout << "before predict: " << std::endl;
auto output =
executor.Predict(input, "data", "conv2_1_dw_bn.tmp_2", out_ddim);
std::cout << "after predict " << std::endl;
auto output_ptr = output->data<float>();
int stride = output->numel() / 100;
for (int i = 0; i < 100; i++) {
DLOG << " index:" << i * stride << " value: " << output_ptr[i * stride];
}
// for (int i = 0; i < 100; i++) {
// DLOG << " index:" << i << " value: "<< output_ptr[i];
// }
// for (int j = 0; j < output->numel(); ++j) {
// std::cout << " (index: " << j << " value: " << output_ptr[j] << ") ";
// }
std::cout << std::endl;
return 0;
}
......@@ -16,22 +16,29 @@ limitations under the License. */
#include <fstream>
#include <random>
#include <string>
#include <vector>
#include "common/common.h"
#include "common/log.h"
#include "framework/ddim.h"
#include "framework/tensor.h"
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet";
static const std::string g_resnet_50 = "../models/resnet_50";
static const std::string g_resnet = "../models/resnet";
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_yolo = "../models/yolo";
static const std::string g_test_image_1x3x224x224 =
static const char *g_mobilenet_ssd = "../models/mobilenet+ssd";
static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture";
static const char *g_squeezenet = "../models/squeezenet";
static const char *g_googlenet = "../models/googlenet";
static const char *g_mobilenet = "../models/mobilenet";
static const char *g_resnet_50 = "../models/resnet_50";
static const char *g_resnet = "../models/resnet";
static const char *g_googlenet_combine = "../models/googlenet_combine";
static const char *g_yolo = "../models/yolo";
static const char *g_test_image_1x3x224x224 =
"../images/test_image_1x3x224x224_float";
static const char *g_test_image_1x3x224x224_banana =
"../images/input_3x224x224_banana";
static const char *g_hand = "../images/hand_image";
using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Tensor;
......@@ -62,9 +69,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
size *= dim;
}
T *input_ptr = (T *)malloc(sizeof(T) * size);
T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
std::ifstream in(input_name, std::ios::in | std::ios::binary);
in.read((char *)(input_ptr), size * sizeof(T));
in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
in.close();
for (int i = 0; i < size; ++i) {
input->push_back(input_ptr[i]);
......@@ -79,6 +86,6 @@ void GetInput(const std::string &input_name,
T *input_ptr = input->mutable_data<T>(dims);
std::ifstream in(input_name, std::ios::in | std::ios::binary);
in.read((char *)(input_ptr), input->numel() * sizeof(T));
in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
in.close();
}
#!/usr/bin/env bash
NETS=""
declare -a supportedNets=("googlenet" "mobilenet" "yolo" "squeezenet" "resnet")
build_for_mac() {
if [ ! `which brew` ]; then
......@@ -38,7 +40,8 @@ build_for_android() {
fi
if [ -z "$PLATFORM" ]; then
PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
# PLATFORM="arm-v8a"
fi
if [ "${PLATFORM}" = "arm-v7a" ]; then
......@@ -59,7 +62,8 @@ build_for_android() {
ANDROID_PLATFORM_VERSION="android-22"
TOOLCHAIN_FILE="./tools/android-cmake/android.toolchain.cmake"
ANDROID_ARM_MODE="arm"
if [ $# -eq 1 ]; then
if [ "${#NETS}" > 1 ]; then
cmake .. \
-B"../build/release/${PLATFORM}" \
-DANDROID_ABI="${ABI}" \
......@@ -69,7 +73,7 @@ build_for_android() {
-DCMAKE_CXX_FLAGS="${CXX_FLAGS}" \
-DANDROID_STL=c++_static \
-DANDROID=true \
-DNET=$1 \
-DNET="${NETS}" \
-D"${ARM_PLATFORM}"=true
else
......@@ -92,23 +96,25 @@ build_for_ios() {
# rm -rf "../build"
PLATFORM="ios"
MODE="Release"
BUILD_DIR=../build/release/"${PLATFORM}"
BUILD_DIR=../build/release/"${PLATFORM}"/
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
mkdir -p "${BUILD_DIR}"
if [ $# -eq 1 ]; then
if [ "${#NETS}" > 1 ]; then
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DNET=$1 \
-DIOS_ARCH="${IOS_ARCH}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DNET="${NETS}" \
-DIS_IOS="true"
else
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="${IOS_ARCH}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIS_IOS="true"
fi
cd "${BUILD_DIR}"
......@@ -120,7 +126,7 @@ build_for_ios() {
}
build_error() {
echo "unknown argument"
echo "unknown target : $1"
}
if [ $# -lt 1 ]; then
......@@ -128,31 +134,37 @@ if [ $# -lt 1 ]; then
echo "available targets: ios|android"
echo "sample usage: ./build.sh android"
else
if [ $# -eq 2 ]; then
if [ $2 != "googlenet" -a $2 != "mobilenet" -a $2 != "yolo" -a $2 != "squeezenet" -a $2 != "resnet" ]; then
if [ $1 = "android" ]; then
build_for_android
elif [ $1 = "ios" ]; then
build_for_ios
else
build_error
fi
else
if [ $1 = "android" ]; then
build_for_android $2
elif [ $1 = "ios" ]; then
build_for_ios $2
else
build_error
fi
params=($@)
for(( i=1; i<$#; i++ )); do
if [ ${i} != 1 ]; then
NETS=$NETS$";"
fi
NETS=$NETS$"${params[i]}"
done
params=${@:2}
supported=false
for name in ${params[@]}; do
for net in ${supportedNets[@]}; do
match=false
if [ "$name"x = "$net"x ];then
supported=true
match=true
break 1
fi
done
if [ "$match" = false ];then
echo "${name} not supported!"
echo "supported nets are: ${supportedNets[@]}"
exit -1
fi
done
if [ $1 = "android" ]; then
build_for_android
elif [ $1 = "ios" ]; then
build_for_ios
else
if [ $1 = "android" ]; then
build_for_android
elif [ $1 = "ios" ]; then
build_for_ios
else
build_error
fi
fi
build_error "$1"
fi
fi
\ No newline at end of file
......@@ -34,6 +34,7 @@ set (CMAKE_SYSTEM_VERSION 1)
set (UNIX True)
set (APPLE True)
set (IOS True)
set (IOS_ARCH armv7 armv7s arm64)
# Required as of cmake 2.8.10
set (CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
......@@ -159,7 +160,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
# set the architecture for iOS
if (${IOS_PLATFORM} STREQUAL "OS")
set (IOS_ARCH armv7 armv7s arm64)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
set (IOS_ARCH i386)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
......
BEGIN {
print "digraph G {"
}
/op:/ {
id++
opname[id] = $NF
}
/input/ {
type = "input"
para = $NF
if (input[id]) {
input[id] = input[id] "|"
}
input[id] = input[id] "<" para ">" para
}
/output/ {
type = "output"
para = $NF
if (output[id]) {
output[id] = output[id] "|"
}
output[id] = output[id] "<" para ">" para
}
/attr/ {
type = "attr"
aname = $NF
if (attr_key[id]) {
attr_key[id] = attr_key[id] "|"
attr_value[id] = attr_value[id] "|"
}
attr_key[id] = attr_key[id] $NF
}
/argument/ {
if (type == "attr") {
split($0, arr, " - ")
attr_value[id] = attr_value[id] arr[2]
} else if ((type == "input") || (type == "output")) {
if (!var2id[$NF]) {
var_id++
var[var_id] = $NF
var2id[$NF] = var_id
}
varid = var2id[$NF]
lid++
if (type == "input") {
line[lid] = "var_" varid " -> " "op_" id ":<" para ">"
if (xout[$NF]) {
xi++
xline[xi] = "xop_" xout[$NF] " -> " "xop_" id
}
} else if (type == "output") {
line[lid] = "op_" id ":<" para ">" " -> " "var_" varid
xout[$NF] = id
}
}
}
/var name/ {
varname = $NF
vid = var2id[varname]
}
/var tensor desc dim / {
if (tensor[vid]) tensor[vid] = tensor[vid] " x "
tensor[vid] = tensor[vid] $NF
}
END {
print "subgraph cluster_G0 {"
for (i = 1; i <= id; i++) {
print "xop_" i "[label=\"" i ". " opname[i] "\"]"
}
for (i = 1; i <= xi; i++) {
print xline[i]
}
print "}"
for (i = 1; i <= id; i++) {
print "op_" i "[group=op;shape=record;label=\"{{" input[i] "}|<op>" i ". " opname[i] "|{" output[i] "}}\"]"
}
for (i = 1; i <= var_id; i++) {
print "var_" i "[label=\"" var[i] " [" tensor[i] "]\"]"
}
for (i = 1; i <= lid; i++) {
print line[i]
}
for (i = 1; i <= id; i++) {
print "attr_" i "[shape=record;label=\"{" attr_key[i] "}|{" attr_value[i] "}\"]"
print "attr_" i " -> " "op_" i ":<op>"
}
print "}"
}
if (NET STREQUAL "googlenet")
set(FOUND_MATCH OFF)
if ("googlenet" IN_LIST NET)
message("googlenet enabled")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(LRN_OP ON)
......@@ -8,8 +10,13 @@ if (NET STREQUAL "googlenet")
set(POOL_OP ON)
set(RELU_OP ON)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
elseif (NET STREQUAL "mobilenet")
set(FUSION_CONVADDRELU_OP ON)
set(FOUND_MATCH ON)
endif()
if ("mobilenet" IN_LIST NET)
message("mobilenet enabled")
set(CONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(RELU_OP ON)
......@@ -21,12 +28,23 @@ elseif (NET STREQUAL "mobilenet")
set(RESHAPE_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADD_OP ON)
elseif (NET STREQUAL "yolo")
set(FOUND_MATCH ON)
endif()
if ("yolo" IN_LIST NET)
message("yolo enabled")
set(BATCHNORM_OP ON)
set(CONV_OP ON)
set(RELU_OP ON)
set(ELEMENTWISEADD_OP ON)
elseif (NET STREQUAL "squeezenet")
set(FOUND_MATCH ON)
endif()
if ("squeezenet" IN_LIST NET)
message("squeezenet enabled")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(RELU_OP ON)
......@@ -34,15 +52,45 @@ elseif (NET STREQUAL "squeezenet")
set(POOL_OP ON)
set(RESHAPE_OP ON)
set(SOFTMAX_OP ON)
elseif (NET STREQUAL "resnet")
set(FOUND_MATCH ON)
endif()
if ("resnet" IN_LIST NET)
message("resnet enabled")
set(CONCAT_OP ON)
set(CONV_OP ON)
set(BATCHNORM_OP ON)
set(RELU_OP ON)
set(ELEMENTWISEADD_OP ON)
set(POOL_OP ON)
set(RESHAPE_OP ON)
set(SOFTMAX_OP ON)
set(MUL_OP ON)
set(FOUND_MATCH ON)
endif()
if ("FPGAnets" IN_LIST NET)
message("FPGAnets enabled")
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_CONVADDBN_OP ON)
set(FUSION_POOLBN_OP ON)
set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(FUSION_FC_OP ON)
set(FUSION_FCRELU_OP ON)
set(REGION_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
else ()
set(CONCAT_OP ON)
set(SOFTMAX_OP ON)
set(DROPOUT_OP ON)
set(FOUND_MATCH ON)
endif()
if(NOT FOUND_MATCH)
message("--default--")
set(BATCHNORM_OP ON)
set(BOXCODER_OP ON)
set(CONCAT_OP ON)
......@@ -50,7 +98,7 @@ else ()
set(DEPTHWISECONV_OP ON)
set(ELEMENTWISEADD_OP ON)
set(FUSION_CONVADD_OP ON)
set(CONVADDRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
......@@ -62,15 +110,17 @@ else ()
set(SIGMOID_OP ON)
set(SOFTMAX_OP ON)
set(TRANSPOSE_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_DWCONVBNRELU_OP ON)
set(FUSION_CONVBNRELU_OP ON)
set(PRELU_OP ON)
set(RESIZE_OP ON)
set(SCALE_OP ON)
set(SLICE_OP ON)
set(DROPOUT_OP ON)
set(IM2SEQUENCE_OP ON)
endif()
# option(BATCHNORM_OP "" ON)
# option(BOXCODER_OP "" ON)
# option(CONCAT_OP "" ON)
......@@ -78,7 +128,7 @@ else ()
# option(DEPTHWISECONV_OP "" ON)
# option(ELEMENTWISEADD_OP "" ON)
# option(FUSION_CONVADD_OP "" ON)
# option(CONVADDRELU_OP "" ON)
# option(FUSION_CONVADDRELU_OP "" ON)
# option(FUSION_FC_OP "" ON)
# option(LRN_OP "" ON)
# option(MUL_OP "" ON)
......@@ -90,8 +140,7 @@ else ()
# option(SIGMOID_OP "" ON)
# option(SOFTMAX_OP "" ON)
# option(TRANSPOSE_OP "" ON)
# option(FUSION_CONVADD_RELU_OP "" ON)
endif ()
# endif ()
if (BATCHNORM_OP)
add_definitions(-DBATCHNORM_OP)
......@@ -114,8 +163,8 @@ endif()
if (FUSION_CONVADD_OP)
add_definitions(-DFUSION_CONVADD_OP)
endif()
if (CONVADDRELU_OP)
add_definitions(-DCONVADDRELU_OP)
if (FUSION_CONVADDRELU_OP)
add_definitions(-DFUSION_CONVADDRELU_OP)
endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
......@@ -150,15 +199,17 @@ endif()
if (TRANSPOSE_OP)
add_definitions(-DTRANSPOSE_OP)
endif()
if (FUSION_CONVADD_RELU_OP)
add_definitions(-DFUSION_CONVADD_RELU_OP)
endif()
if (FUSION_CONVADDBNRELU_OP)
add_definitions(-DFUSION_CONVADDBNRELU_OP)
endif()
if (FUSION_DWCONVBNRELU_OP)
add_definitions(-DFUSION_DWCONVBNRELU_OP)
endif()
if (FUSION_CONVBNRELU_OP)
add_definitions(-DFUSION_CONVBNRELU_OP)
endif()
if (PRELU_OP)
add_definitions(-DPRELU_OP)
endif()
......@@ -177,3 +228,20 @@ endif()
if (IM2SEQUENCE_OP)
add_definitions(-DIM2SEQUENCE_OP)
endif()
if (FUSION_CONVADDBN_OP)
add_definitions(-DFUSION_CONVADDBN_OP)
endif()
if (FUSION_FCRELU_OP)
add_definitions(-DFUSION_FCRELU_OP)
endif()
if (FUSION_POOLBN_OP)
add_definitions(-DFUSION_POOLBN_OP)
endif()
if (FUSION_ELEMENTWISEADDRELU_OP)
add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP)
endif()
if (REGION_OP)
add_definitions(-DREGION_OP)
endif()
cmake_minimum_required(VERSION 3.6)
project(quali)
add_definitions(-DENABLE_EXCEPTION)
set(CMAKE_CXX_STANDARD 11)
file(GLOB_RECURSE QULIFICATON_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE QULIFICATON_H src/*.h)
include_directories(. src/)
#add_library(paddle-mobile SHARED ${QULIFICATON_CC} ${QULIFICATON_H} convert.cpp)
add_executable(quantify convert.cpp ${QULIFICATON_CC} ${QULIFICATON_H})
\ No newline at end of file
# 模型量化脚本
#### 量化脚本使用指南
1. 在PaddleMobile项目目录下(如 ~/PaddleProject/paddle-mobile)
2. cd到 tools/quantification/ 目录
3. cmake编译
``` sh
cmake .
make
```
4. 运行量化脚本
```sh
./quantify (0:seperated. 1:combined ) (输入路径) (输出路径)
# quantify googlenet seperated from /Users/xiebaiyuan/PaddleProject/quali/models/googlenet to ./googlenet_min
./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
```
*注:*
*量化工具中*
*1.seperated模型model文件默认命名为 "__model__";*
*2.combined模型的model文件默认命名为 "model",参数文件默认命名为"params";*
##### 整体如下:
以googlenet非combined为例:
```sh
cd tools/quantification/
cmake .
make
./quantify 0 /Users/xiebaiyuan/PaddleProject/quali/models/googlenet ./googlenet_min
```
#include "src/enforce.h"
#include "src/var_desc.h"
#include "src/program_desc.h"
#include <cstdlib>
#include <string>
#include <cmath>
#include <iostream>
#include <utility>
#include <vector>
#include "src/framework.pb-c.h"
#include "src/protobuf-c.h"
#include <fstream>
#include <iostream>
const size_t kSize64 = sizeof(uint64_t);
const size_t kSize32 = sizeof(uint32_t);
char *Get_binary_data(const std::string &filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
auto *data = new char[size];
size_t bytes_read = fread(data, 1, static_cast<size_t>(size), file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
return data;
}
static size_t ReadBuffer(const char *file_name, uint8_t **out) {
FILE *fp;
fp = fopen(file_name, "rb");
PADDLE_MOBILE_ENFORCE(fp != nullptr, " %s open failed !", file_name);
fseek(fp, 0, SEEK_END);
auto size = static_cast<size_t>(ftell(fp));
rewind(fp);
*out = reinterpret_cast<uint8_t *>(malloc(size));
size_t cur_len = 0;
size_t nread;
while ((nread = fread(*out + cur_len, 1, size - cur_len, fp)) != 0) {
cur_len += nread;
}
fclose(fp);
return cur_len;
}
std::shared_ptr<ProgramDesc> loadParams(const std::string &model_path) {
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = nullptr;
size_t read_size = ReadBuffer(model_path.c_str(), &buf);
PADDLE_MOBILE_ENFORCE(buf != nullptr, "read from __model__ is null");
c_program = paddle_mobile__framework__proto__program_desc__unpack(
nullptr, read_size, buf);
PADDLE_MOBILE_ENFORCE(c_program != nullptr, "program is null");
auto originProgramDesc = std::make_shared<ProgramDesc>(c_program);
return originProgramDesc;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc &var_desc, char *dataP, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(dataP);
// write version
fwrite(&version, kSize32, 1, out_file);
dataP += kSize32;
// 2 Lod information
auto *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, dataP, kSize64);
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, kSize64, 1, out_file);
delete lod_level_ptr;
dataP += kSize64;
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(dataP);
// write lod size
fwrite(&size, kSize64, 1, out_file);
(dataP) += kSize64;
std::vector<size_t> tmp(size / sizeof(size_t));
for (unsigned long &k : tmp) {
k = *reinterpret_cast<size_t *>(dataP);
(dataP) += sizeof(size_t);
}
// write lod size vector
fwrite(&tmp, sizeof(size_t), tmp.size(), out_file);
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(dataP);
// write tensor version
fwrite(&tensor_version, kSize32, 1, out_file);
(dataP) += kSize32;
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(dataP);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(dataP) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (dataP)[m];
}
fwrite(buf.get(), sizeof(char), static_cast<size_t>(size), out_file);
(dataP) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
size_t tensorSize = sizeof(char) * memory_size * type_size;
memory = new char[tensorSize];
for (int n = 0; n < tensorSize; ++n) {
static_cast<char *>(memory)[n] = (dataP)[n];
}
dataP += tensorSize;
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::min();
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *> (memory)[k]);
max_value = std::max(max_value, static_cast<float *> (memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *> (memory)[g];
auto factor = (uint8_t) round((value - min_value) / (max_value - min_value) * 255);
fwrite(&factor, sizeof(uint8_t), 1, out_file);
}
}
void
quantificate_combined(const std::string &model_path, const std::string &param_path, const std::string &param_min_path) {
auto program = loadParams(model_path);
char *origin_data = Get_binary_data(param_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, data, out_file);
}
}
}
fclose(out_file);
delete origin_data;
}
void quantificate_seperated(const std::string model_dir, const std::string param_min_path) {
auto program = loadParams(model_dir + "/__model__");
std::string shell_command = "mkdir " + param_min_path;
system(shell_command.c_str());
for (const auto &block : program->Blocks()) {
for (const auto &var_desc : block->Vars()) {
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path + "/" + var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data = Get_binary_data(model_dir + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, data, out_file);
delete origin_data;
fclose(out_file);
}
}
}
}
int main(int argc, char **argv) {
const std::string kNoteEg = "( eg: ./quantify 1 your_combined_model_path output_path or ./quantify 0 your_seperated_model_path output_path)";
PADDLE_MOBILE_ENFORCE(argc > 1, "wee need params.%s ", kNoteEg.c_str());
std::string action_type = argv[1];
PADDLE_MOBILE_ENFORCE(argc > 1 && (action_type) == "1" || action_type == "0",
"only 1 or 2 supported, current is %s %s ",
action_type.c_str(),
kNoteEg.c_str());
PADDLE_MOBILE_ENFORCE(argc > 2, "we need your model path. %s ", kNoteEg.c_str());
std::string base_path = argv[2];
PADDLE_MOBILE_ENFORCE(argc > 3, "we need your output path. %s ", kNoteEg.c_str());
std::string output_path = argv[3];
if (action_type == "0") {
// for seperated
const std::string &seperated_min_dir = output_path;
quantificate_seperated(base_path, seperated_min_dir);
return 0;
}
if (action_type == "1") {
// for combined
const std::string &combined_min_dir = output_path;
std::string model_path = base_path + "/model";
std::string param_path = base_path + "/params";
quantificate_combined(model_path, param_path, combined_min_dir);
return 0;
}
return -1;
}
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
set(ANDROID_ARM_NEON ON)
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
\ No newline at end of file
set(ANDROID_PIE TRUE)
set(ANDROID_STL "c++_static")
set(ANDROID_PLATFORM "android-22")
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册