Commit 7b3df2f0 authored by dolphin8, committed by GitHub

Merge branch 'develop' into develop

cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.6)
project(paddle-mobile)
option(DEBUGING "enable debug mode" ON)
option(USE_OPENMP "openmp support" ON)
option(USE_OPENMP "openmp support" OFF)
option(USE_EXCEPTION "use std exception" ON)
option(LOG_PROFILE "log profile" ON)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
option(QUANTI "quantification" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/)
if(IS_IOS)
set(CMAKE_CXX_FLAGS "-fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}")
endif()
......@@ -43,7 +44,7 @@ if (LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE)
endif()
if(USE_OPENMP)
if(USE_OPENMP AND NOT IS_IOS)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp")
add_definitions(-DPADDLE_MOBILE_USE_OPENMP)
endif()
......@@ -104,12 +105,21 @@ else()
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
file(GLOB_RECURSE _tmp_list src/fpga/*.cpp src/fpga/*.cc)
foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f})
endforeach()
file(GLOB_RECURSE _tmp_list_h src/fpga/*.h)
foreach(f ${_tmp_list_h})
list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach()
endif()
if (ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
add_definitions(-DARMV7)
else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
......@@ -131,7 +141,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default
set(NET "defult" CACHE STRING "select net type")
set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet")
set_property(CACHE NET PROPERTY STRINGS "defult" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets")
include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
......@@ -153,3 +163,7 @@ if(DEBUGING)
endif()
endif()
if (QUANTI)
add_subdirectory(tools/quantification)
endif ()
......@@ -27,10 +27,10 @@ Paddle-Mobile is a project under the PaddlePaddle organization, dedicated to embedded platforms
- **ARM CPU**
![](http://7xop3k.com1.z0.glb.clouddn.com/15312108766575.jpg)
![](http://mms-graph.bj.bcebos.com/paddle-mobile%2F2018_07_18.png)
The ARM CPU is paddle-mobile's primary target; the ubiquity of CPUs has always been its advantage. Embedded deep learning requires a large amount of hand-written CPU assembly, and we are coding intensively to squeeze every bit of acceleration out of the hardware.
ARM CPU optimization is still in progress; only generic CPU optimizations are applied so far. On an ARM A73, paddle-mobile (arm-v7) currently runs one MobileNet 1.0 inference in 120+ ms on a single core. This is clearly not our final goal: we are rewriting large parts in assembly, so there is still substantial headroom. Only armv7 is supported at the moment; armv8 support will follow.
ARM CPU optimization is still in progress; only generic CPU optimizations are applied so far. On an ARM A73, paddle-mobile (arm-v7) currently runs one MobileNet 1.0 inference in 110+ ms on a single core. This is clearly not our final goal: we are rewriting large parts in assembly, so there is still substantial headroom. Only armv7 is supported at the moment; armv8 support will follow.
- **Mali GPU**
......
......@@ -24,6 +24,8 @@ const std::string G_OP_TYPE_CONCAT = "concat";
const std::string G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const std::string G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const std::string G_OP_TYPE_FC = "fusion_fc";
const std::string G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const std::string G_OP_TYPE_LRN = "lrn";
......@@ -42,11 +44,21 @@ const std::string G_OP_TYPE_FETCH = "fetch";
const std::string G_OP_TYPE_DEPTHWISE_CONV = "depthwise_conv2d";
const std::string G_OP_TYPE_IM2SEQUENCE = "im2sequence";
const std::string G_OP_TYPE_DROPOUT = "dropout";
const std::string G_OP_TYPE_FUSION_CONV_RELU = "fusion_conv_relu";
const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE = "fusion_conv_bn_scale";
const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU =
"fusion_conv_bn_scale_relu";
const std::string G_OP_TYPE_FUSION_POOL_BN = "fusion_pool_bn";
const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU =
"fusion_elementwise_add_relu";
const std::string G_OP_TYPE_REGION = "region";
std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = {
{G_OP_TYPE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_DWCONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD, {{"Input"}, {"Out"}}},
{G_OP_TYPE_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_SOFTMAX, {{"X"}, {"Out"}}},
......@@ -70,6 +82,12 @@ std::unordered_map<
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}}};
{G_OP_TYPE_DROPOUT, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_SCALE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_POOL_BN, {{"X"}, {"Out"}}},
{G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_REGION, {{"X"}, {"Out"}}}};
} // namespace paddle_mobile
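Note: a minimal sketch of how this table is consumed (GetInputKeys/GetOutKeys in framework/operator.cpp below perform the same lookup; the include path is an assumption):

#include <iostream>
#include "common/types.h"  // assumed header for the declarations above

void PrintFusionKeys() {
  using namespace paddle_mobile;
  auto it = op_input_output_key.find(G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU);
  if (it == op_input_output_key.end()) return;
  for (const auto &in : it->second.first) {    // "X", "Y"
    std::cout << "input:  " << in << std::endl;
  }
  for (const auto &out : it->second.second) {  // "Out"
    std::cout << "output: " << out << std::endl;
  }
}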
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
namespace paddle_mobile {
......@@ -81,6 +82,8 @@ extern const std::string G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const std::string G_OP_TYPE_FC;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD;
extern const std::string G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_DWCONV_BN_RELU;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_RELU;
extern const std::string G_OP_TYPE_LRN;
extern const std::string G_OP_TYPE_MUL;
......@@ -99,6 +102,13 @@ extern const std::string G_OP_TYPE_DEPTHWISE_CONV;
extern const std::string G_OP_TYPE_IM2SEQUENCE;
extern const std::string G_OP_TYPE_DROPOUT;
extern const std::string G_OP_TYPE_FUSION_CONV_RELU;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE;
extern const std::string G_OP_TYPE_FUSION_CONV_BN_SCALE_RELU;
extern const std::string G_OP_TYPE_FUSION_POOL_BN;
extern const std::string G_OP_TYPE_FUSION_ELEMENTWISE_ADD_RELU;
extern const std::string G_OP_TYPE_REGION;
extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key;
......
......@@ -84,7 +84,7 @@ struct Variant {
if (type_id == typeid(T).hash_code()) {
return *const_cast<T *>(reinterpret_cast<const T *>(&data));
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant ");
PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant");
exit(0);
}
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include "fpga/api/fpga_api.h"
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
static int fd = -1;
static const char *device_path = "/dev/fpgadrv0";
static inline int do_ioctl(int req, void *arg) { return ioctl(fd, req, arg); }
int open_device() {
if (fd == -1) {
fd = open(device_path, O_RDWR);
}
return fd;
}
// memory management;
void *fpga_malloc(size_t size) {
return reinterpret_cast<void *>(
    mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
}
void fpga_free(void *ptr) { munmap(ptr, 0); }
void fpga_copy(void *dest, const void *src, size_t num) {
memcpy(dest, src, num);
}
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <iostream>
#include <limits>
// memory management;
namespace paddle {
namespace mobile {
namespace fpga {
namespace api {
int open_device();
int close_device();
void *fpga_malloc(size_t size);
void fpga_free(void *ptr);
void fpga_copy(void *dst, const void *src, size_t num);
struct CnnVersionArgs {
void *buf;
};
struct QuantArgs {
float scale;
};
struct BatchNormalizationArgs {
bool enable;
};
struct ScaleArgs {};
#define IOCTL_CNN_MAGIC 'CNN'
#define IOCTL_VERSION _IOW(IOCTL_CNN_MAGIC, 1, struct CnnVersionArgs)
#define IOCTL_GET_QUANT _IOW(IOCTL_CNN_MAGIC, 2, struct QuantArgs)
#define IOCTL_SET_QUANT _IOW(IOCTL_CNN_MAGIC, 3, struct QuantArgs)
} // namespace api
} // namespace fpga
} // namespace mobile
} // namespace paddle
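Note: a minimal usage sketch of the API declared above. The call sequence is illustrative only and assumes /dev/fpgadrv0 exists; error handling is elided:

#include "fpga/api/fpga_api.h"

int FpgaSmokeTest() {
  namespace api = paddle::mobile::fpga::api;
  if (api::open_device() < 0) {
    return -1;  // device node missing or no permission
  }
  float host[16] = {0.f};
  void *buf = api::fpga_malloc(sizeof(host));  // mmap64-backed buffer on the device fd
  api::fpga_copy(buf, host, sizeof(host));     // currently a plain memcpy
  api::fpga_free(buf);
  return 0;
}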
......@@ -28,6 +28,16 @@ vector<string> OperatorBase<Dtype>::GetOutKeys() const {
return it->second.second;
}
template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) {
DLOG << type_ << " has no inputs";
return {};
}
return it->second.first;
}
template <typename Dtype>
OperatorBase<Dtype>::OperatorBase(const std::string &type,
const VariableNameMap &inputs,
......@@ -49,6 +59,11 @@ template <typename Dtype>
void OperatorBase<Dtype>::Run() const {
RunImpl();
#ifdef PADDLE_MOBILE_DEBUG
vector<string> input_keys = GetInputKeys();
for (const auto key : input_keys) {
Tensor *input = GetVarValue<framework::LoDTensor>(key, inputs_, *scope_);
DLOG << type_ << " input- " << key << "=" << *input;
}
vector<string> output_keys = GetOutKeys();
for (const auto key : output_keys) {
Tensor *out_ = GetVarValue<framework::LoDTensor>(key, outputs_, *scope_);
......
......@@ -61,6 +61,7 @@ class OperatorBase {
virtual ~OperatorBase() {}
void Run() const;
std::vector<string> GetOutKeys() const;
std::vector<string> GetInputKeys() const;
virtual void RunImpl() const = 0;
virtual void Init() = 0;
......@@ -118,6 +119,10 @@ class OperatorWithKernel : public OperatorBase<Dtype> {
virtual void InferShape() const = 0;
void Init() {
// for (auto i : this->inputs_) {
// DLOG << i.first;
// DLOG << i.second;
// }
PADDLE_MOBILE_ENFORCE(kernel_.Init(&param_), " %s kernel init failed",
this->type_.c_str());
}
......@@ -146,7 +151,7 @@ class OpKernelBase {
}
#endif
virtual void Compute(const P &para) const = 0;
virtual bool Init(P *para) { return true; };
virtual bool Init(P *para) { return true; }
virtual ~OpKernelBase() = default;
private:
......
......@@ -42,8 +42,17 @@ class FusionOpRegister {
matchers_[matcher->Type()] = shared_matcher;
}
const std::map<std::string, std::shared_ptr<FusionOpMatcher>> Matchers() {
return matchers_;
const std::vector<std::shared_ptr<FusionOpMatcher>> Matchers() {
std::vector<std::shared_ptr<FusionOpMatcher>> matchers;
for (const auto& match : matchers_) {
matchers.push_back(match.second);
}
std::sort(matchers.begin(), matchers.end(),
[](std::shared_ptr<FusionOpMatcher> first,
std::shared_ptr<FusionOpMatcher> second) {
return first->BeginNode().Depth() > second->BeginNode().Depth();
});
return matchers;
}
private:
......
......@@ -44,23 +44,6 @@ bool Node::operator==(const Node &in) {
return true;
}
std::vector<std::shared_ptr<framework::OpDesc>> Node::OpDescs(int size) {
std::vector<std::shared_ptr<framework::OpDesc>> op_descs;
OpDescs(size - 1, &op_descs);
return op_descs;
}
void Node::OpDescs(int index,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc) {
if (index == 0) {
return;
}
op_desc->push_back(this->op_desc_);
for (auto &output : outputs_) {
output->OpDescs(index, op_desc);
}
}
std::shared_ptr<Node> Node::To(int size) {
std::shared_ptr<Node> node = std::make_shared<Node>();
this->To(size - 1, node);
......
......@@ -47,13 +47,10 @@ class Node {
std::map<std::string, std::vector<std::pair<std::string, std::string>>>
change,
std::vector<std::shared_ptr<Node>> *removed_nodes);
std::vector<std::shared_ptr<framework::OpDesc>> OpDescs(int size);
std::shared_ptr<framework::OpDesc> OpDescOfNode() { return op_desc_; }
std::string Type() { return type_; }
private:
void OpDescs(int size,
std::vector<std::shared_ptr<framework::OpDesc>> *op_desc);
void To(int index, std::shared_ptr<Node>);
void Folder(
std::shared_ptr<framework::OpDesc> op_desc,
......
......@@ -78,9 +78,8 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
}
for (auto &registed : FusionOpRegister::Instance()->Matchers()) {
std::string fusion_type = registed.first;
std::shared_ptr<FusionOpMatcher> matcher = registed.second;
// DLOG << " registed node \n " << matcher->BeginNode();
std::string fusion_type = registed->Type();
std::shared_ptr<FusionOpMatcher> matcher = registed;
auto match_vector = type_map[matcher->BeginType()];
......
......@@ -30,6 +30,7 @@ class Program {
std::string model_path;
std::string para_path;
bool combined = false;
bool quantification = false;
private:
};
......
......@@ -154,7 +154,7 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
tensor->Resize(framework::make_ddim(desc.Dims()));
void *memory = tensor;
void *memory = nullptr;
int type_size = 0;
switch (desc.DataType()) {
case framework::VARTYPE_TYPE_FP16:
......@@ -179,11 +179,25 @@ void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc,
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = (uint8_t *)(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
}
(*data) += (sizeof(char) * memory_size * type_size);
}
template <typename Dtype, Precision P>
......
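Note: the quantification branch above restores 8-bit weights to float. Each quantized tensor is laid out as two floats (min, max) followed by memory_size uint8 values, and every value is mapped back as q * (max - min) / 255 + min. A standalone sketch of that step (function and names are illustrative):

#include <cstdint>
#include <cstring>
#include <vector>

std::vector<float> Dequantize(const uint8_t *blob, size_t memory_size) {
  float min_value, max_value;
  std::memcpy(&min_value, blob, sizeof(float));
  std::memcpy(&max_value, blob + sizeof(float), sizeof(float));
  const uint8_t *q = blob + 2 * sizeof(float);
  const float factor = (max_value - min_value) / 255.0f;
  std::vector<float> out(memory_size);
  for (size_t k = 0; k < memory_size; ++k) {
    out[k] = q[k] * factor + min_value;  // maps 0..255 back onto [min, max]
  }
  return out;
}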
......@@ -44,26 +44,29 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) {
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &dirname, bool optimize, bool can_add_split) {
auto program =
this->LoadProgram(dirname + "/__model__", optimize, can_add_split);
const std::string &dirname, bool optimize, bool quantification,
bool can_add_split) {
auto program = this->LoadProgram(dirname + "/__model__", optimize,
quantification, can_add_split);
program.model_path = dirname;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::Load(
const std::string &model_path, const std::string &para_path,
bool optimize) {
const std::string &model_path, const std::string &para_path, bool optimize,
bool quantification) {
auto program = this->LoadProgram(model_path, optimize);
program.para_path = para_path;
program.combined = true;
program.quantification = quantification;
return program;
}
template <typename Dtype, Precision P>
const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
const std::string &model_path, bool optimize, bool can_add_split) {
const std::string &model_path, bool optimize, bool quantification,
bool can_add_split) {
std::string model_filename = model_path;
PaddleMobile__Framework__Proto__ProgramDesc *c_program;
uint8_t *buf = NULL;
......@@ -82,6 +85,7 @@ const framework::Program<Dtype, P> Loader<Dtype, P>::LoadProgram(
framework::Program<Dtype, P> program;
program.originProgram = originProgramDesc;
program.quantification = quantification;
auto scope = std::make_shared<framework::Scope>();
program.scope = scope;
......
......@@ -30,6 +30,7 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
/*
......@@ -38,11 +39,13 @@ class Loader {
* */
const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path,
bool optimize = false);
bool optimize = false,
bool quantification = false);
private:
const framework::Program<Dtype, P> LoadProgram(const std::string &model_path,
bool optimize = false,
bool quantification = false,
bool can_add_split = false);
};
......
......@@ -26,7 +26,7 @@ void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -35,7 +35,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(dirname, optimize), batch_size, optimize);
loader_->Load(dirname, optimize, quantification), batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......@@ -46,7 +46,7 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
const std::string &para_path, bool optimize,
int batch_size) {
bool quantification, int batch_size) {
if (loader_.get() == nullptr) {
loader_ = std::make_shared<Loader<Dtype, P>>();
} else {
......@@ -55,7 +55,8 @@ bool PaddleMobile<Dtype, P>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<Executor<Dtype, P>>(
loader_->Load(model_path, para_path, optimize), batch_size, optimize);
loader_->Load(model_path, para_path, optimize, quantification),
batch_size, optimize);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......
......@@ -39,14 +39,18 @@ class PaddleMobile {
* @b Load a fluid model stored as separate model/parameter files
* */
bool Load(const std::string &dirname, bool optimize = false,
int batch_size = 1);
bool quantification = false, int batch_size = 1);
/*
* @b Load a combined-format fluid model (single model file plus parameter file)
* */
bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, int batch_size = 1);
bool optimize = false, bool quantification = false,
int batch_size = 1);
/*
* @b Set the number of threads; takes effect when openmp is enabled in cmake
* */
void SetThreadNum(int num);
/*
......
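Note: a hypothetical call site for the extended Load() overloads above. The header path, template arguments (defaults assumed), and model paths are illustrative, not part of this change:

#include "io/paddle_mobile.h"  // assumed header path

void LoadQuantizedModel() {
  paddle_mobile::PaddleMobile<paddle_mobile::CPU> engine;
  engine.SetThreadNum(4);  // takes effect when openmp is enabled in cmake
  // separate-file model: optimize = true, quantification = true, batch_size = 1
  engine.Load("./mobilenet", true, true, 1);
  // combined model file plus parameter file:
  // engine.Load("./model", "./params", true, true, 1);
}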
......@@ -16,10 +16,32 @@ limitations under the License. */
#include <cstdlib>
#include <cstring>
#ifdef PADDLE_MOBILE_FPGA
#include "fpga/api/fpga_api.h"
#endif
namespace paddle_mobile {
namespace memory {
const int MALLOC_ALIGN = 64;
#ifdef PADDLE_MOBILE_FPGA
namespace api = paddle::mobile::fpga::api;
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
void *Alloc(size_t size) { return api::fpga_malloc(size); }
void Free(void *ptr) {
if (ptr) {
api::fpga_free(ptr);
}
}
#else
void Copy(void *dst, const void *src, size_t num) {
std::memcpy(dst, src, num);
}
......@@ -42,5 +64,7 @@ void Free(void *ptr) {
}
}
#endif
} // namespace memory
} // namespace paddle_mobile
......@@ -26,7 +26,7 @@ void BatchNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.OutputY()->Resize(x_dims);
}
template class BatchNormOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,7 +47,7 @@ void BoxCoderOp<Dtype, T>::InferShape() const {
this->param_.OutputBox()->Resize(framework::make_ddim(
{input_targetbox_dims[0], input_priorbox_dims[0], 4}));
}
template class BoxCoderOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -56,7 +56,6 @@ void ConcatOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(out_dims);
}
template class ConcatOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -48,8 +48,6 @@ void ConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class ConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -49,8 +49,6 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class DepthwiseConvOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ void DropoutOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class DropoutOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ElementwiseAddOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dim);
}
template class ElementwiseAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -14,10 +14,7 @@ limitations under the License. */
#include "feed_op.h"
namespace paddle_mobile {
namespace operators {
template class FeedOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -14,10 +14,7 @@ limitations under the License. */
#include "fetch_op.h"
namespace paddle_mobile {
namespace operators {
template class FetchOp<CPU, float>;
}
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
......
......@@ -45,7 +45,6 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -36,8 +36,6 @@ class FusionConvAddMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}}, removed_nodes);
}
......
......@@ -44,7 +44,7 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
template class FusionConvAddBNReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -39,8 +39,6 @@ class FusionConvAddBNReluMatcher : public framework::FusionOpMatcher {
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
vector<std::shared_ptr<framework::OpDesc>> origin_descs =
node->OpDescs(node_.Depth());
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
{G_OP_TYPE_BATCHNORM,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionConvBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionConvBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_CONV_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>> {
public:
FusionConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionConvBNReluParam,
operators::ConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
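Note: the matcher above declares the conv > batch_norm > relu chain with operator> and FolderNodes() collapses the matched sub-graph into a single fusion op, remapping the extra batch-norm inputs. This commit also adds the G_OP_TYPE_FUSION_CONV_RELU string but no matcher for it; a hypothetical matcher following the same pattern would look like this (sketch only, not part of the change):

class FusionConvReluMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvReluMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_RELU);
  }

  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    // relu only consumes the conv output, so no extra inputs need remapping
    node->Folder(node_.Depth(), Type(), {}, removed_nodes);
  }

  std::string Type() { return G_OP_TYPE_FUSION_CONV_RELU; }
};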
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "op_param.h"
#include "operators/kernel/dwconv_bn_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
using std::string;
using std::vector;
class FusionDWConvBNReluMatcher : public framework::FusionOpMatcher {
public:
FusionDWConvBNReluMatcher() {
node_ = framework::Node(G_OP_TYPE_DEPTHWISE_CONV);
node_ > std::make_shared<framework::Node>(G_OP_TYPE_BATCHNORM) >
std::make_shared<framework::Node>(G_OP_TYPE_RELU);
}
void FolderNodes(
framework::Node *node,
std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
node->Folder(node_.Depth(), Type(),
{{G_OP_TYPE_BATCHNORM,
{{"Scale", "Scale"},
{"Mean", "Mean"},
{"Bias", "Bias"},
{"Variance", "Variance"}}}},
removed_nodes);
}
std::string Type() { return G_OP_TYPE_FUSION_DWCONV_BN_RELU; }
};
template <typename DeviceType, typename T>
class FusionDWConvBNReluOp : public framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam,
operators::DWConvBNReluKernel<DeviceType, T>> {
public:
FusionDWConvBNReluOp(const string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam,
operators::DWConvBNReluKernel<DeviceType, T>>(type, inputs, outputs,
attrs, scope) {}
using framework::OperatorWithKernel<
DeviceType, FusionDWConvBNReluParam,
operators::DWConvBNReluKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected:
};
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators
} // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_dwconv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif
......@@ -50,7 +50,6 @@ void FusionFcOp<Dtype, T>::InferShape() const {
this->param_.Out()->Resize(ddim);
}
template class FusionFcOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,8 +47,6 @@ void Im2SequenceOp<Dtype, T>::InferShape() const {
this->param_.Output()->Resize(ddim);
}
template class Im2SequenceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/kernel/conv_add_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_bn_relu_func.h"
#include "operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/conv_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNReluKernel<CPU, float>::Init(FusionConvBNReluParam *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
// DLOG << "variance: " << *variance;
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
template <>
void ConvBNReluKernel<CPU, float>::Compute(
const FusionConvBNReluParam &param) const {
ConvBNReluCompute<float>(param);
}
template class ConvBNReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
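Note: Init() above folds the batch-norm parameters into a single per-channel affine transform, so Compute() only applies a scale and a bias after the convolution. With gamma = Scale, beta = Bias, mu = Mean and sigma^2 = Variance:

\[
\hat{\gamma}_c = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \varepsilon}}, \qquad
\hat{\beta}_c = \beta_c - \mu_c\,\hat{\gamma}_c, \qquad
\mathrm{BN}(y)_c = \hat{\gamma}_c\, y_c + \hat{\beta}_c
\]

new_scale stores gamma-hat and new_bias stores beta-hat; the same folding is used by the DWConvBNRelu and ConvAddBNRelu kernels in this commit.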
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/kernel/dwconv_bn_relu_kernel.h"
#include "operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DWConvBNReluKernel<CPU, float>::Init(FusionDWConvBNReluParam *param) {
const Tensor *mean = param->InputMean();
const Tensor *variance = param->InputVariance();
const Tensor *scale = param->InputScale();
const Tensor *bias = param->InputBias();
const float epsilon = param->Epsilon();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
const int C = mean->numel();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({C});
auto new_bias_ptr = new_bias->mutable_data<float>({C});
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
return true;
}
template <>
void DWConvBNReluKernel<CPU, float>::Compute(
const FusionDWConvBNReluParam &param) const {
DWConvBNReluCompute<float>(param);
}
template class DWConvBNReluKernel<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -54,7 +54,40 @@ void BatchnormCompute(const BatchNormParam &param) {
int HXW = H * W;
#ifdef ARMV7
#if __ARM_NEON
#if __aarch64__
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(framework::make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(framework::make_ddim({C}));
/// (x - est_mean) * inv_var * scale + bias is equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
}
}
}
}
delete[] inv_std_ptr;
#else
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
......@@ -229,6 +262,7 @@ void BatchnormCompute(const BatchNormParam &param) {
delete[] inv_std_ptr;
}
#endif
#else
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
......@@ -23,14 +25,9 @@ namespace operators {
void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
int axis = param.Axis();
Tensor *output = param.Output();
math::expand_bias(bias, axis, output->dims());
output->ShareDataWith(bias);
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
......@@ -107,7 +104,7 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam &param) {
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias);
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
......@@ -121,7 +118,7 @@ void ConvAddBNReluCompute(const FusionConvAddBNReluParam &param) {
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), 1);
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNReluBasic(const FusionConvBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void ConvBNReluCompute(const FusionConvBNReluParam &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
ConvBNReluBasic(param);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
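Note: ConvBNReluBasic above lowers the convolution to im2col (or vol2col) followed by a GEMM, with the folded scale/bias applied during the write-back (matmulWithBn). A minimal single-channel, stride-1, no-padding im2col sketch to illustrate the lowering (not the library's Im2ColFunctor):

#include <vector>

// input: H x W image (row-major); result: (kh * kw) x (oh * ow) column matrix
std::vector<float> Im2ColSketch(const std::vector<float> &in, int H, int W,
                                int kh, int kw) {
  const int oh = H - kh + 1, ow = W - kw + 1;
  std::vector<float> col(kh * kw * oh * ow);
  for (int r = 0; r < kh; ++r)
    for (int c = 0; c < kw; ++c)
      for (int y = 0; y < oh; ++y)
        for (int x = 0; x < ow; ++x)
          col[((r * kw + c) * oh + y) * ow + x] = in[(y + r) * W + (x + c)];
  return col;
}
// A 1 x (kh*kw) filter row multiplied with this matrix gives the 1 x (oh*ow) output map.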
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv_3x3.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void DWConvBNReluBasic(const FusionDWConvBNReluParam &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor new_bias = *param.NewBias();
Tensor new_scale = *param.NewScale();
Tensor *output = param.Output();
int groups = param.Groups();
std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
size_t data_dim = filter_shape_vec.size() - 2;
std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
col_shape_vec[0] = input->dims()[1] / groups;
for (size_t j = 0; j < data_dim; ++j) {
col_shape_vec[j + 1] = filter_shape_vec[j + 2];
col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
}
framework::DDim col_shape(framework::make_ddim(col_shape_vec));
framework::DDim col_matrix_shape =
framework::flatten_to_2d(col_shape, data_dim + 1);
bool is_expand =
math::IsExpand(filter_shape_vec, strides, paddings, dilations);
Tensor col;
Tensor col_matrix;
if (is_expand) {
col.mutable_data<float>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
input->dims(), 1, static_cast<int>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
filter.Resize(filter_matrix_shape);
framework::DDim output_matrix_shape = {
output->dims()[1],
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
col.ShareDataWith(in_slice);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
vol2col(in_slice, dilations, strides, paddings, &col);
}
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmulWithBn<float>(
filter_slice, false, col_matrix, false, static_cast<float>(1),
&out_slice, static_cast<float>(0), true, &new_scale, &new_bias, g);
}
}
}
template <typename P>
void DWConvBNReluCompute(const FusionDWConvBNReluParam &param) {
if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) {
math::DepthwiseConvAddBNRelu3x3s1p1(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) {
// math::DepthwiseConvAddBNRelu3x3s2p1(param.Input(), param.Filter(),
// param.Output(), param.NewScale(),
// param.NewBias(), 1);
math::DepthwiseConvAddBNRelu3x3s2p1v2(param.Input(), param.Filter(),
param.Output(), param.NewScale(),
param.NewBias(), true);
} else {
DWConvBNReluBasic(param);
}
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -76,15 +76,20 @@ void PoolCompute(const PoolParam &param) {
}
} else if (ksize[0] == 2 && ksize[0] == ksize[1]) {
#ifndef IOS
#if __ARM_NEON
#if __aarch64__
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#else
if (pooling_type == "max") {
math::Pool2x2Max(strides, paddings, in_x, out);
} else if (pooling_type == "avg") {
math::Pool2x2Avg(strides, paddings, in_x, out);
}
#endif
#else
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
#endif
#endif // __ARM_NEON
} else {
PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
}
......
......@@ -68,6 +68,7 @@ void sigmoid(const Tensor *X, Tensor *Y) {
input_outer_ptr++;
}
}
#else
#endif
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVBNRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvBNReluKernel
: public OpKernelBase<DeviceType, FusionConvBNReluParam> {
public:
void Compute(const FusionConvBNReluParam &param) const;
bool Init(FusionConvBNReluParam *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_DWCONVBNRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class DWConvBNReluKernel
: public OpKernelBase<DeviceType, FusionDWConvBNReluParam> {
public:
void Compute(const FusionDWConvBNReluParam &param) const;
bool Init(FusionDWConvBNReluParam *param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -24,7 +24,7 @@ void LrnOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims);
}
template class LrnOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/depthwise_conv_3x3.h"
#ifdef __ARM_NEON
#if __ARM_NEON
#include <arm_neon.h>
#endif
#include <vector>
......@@ -23,7 +23,6 @@ namespace math {
void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
vector<int> paddings, const Tensor *filter, Tensor *bias,
Tensor *output, bool if_bias) {
#ifdef __ARM_NEON
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -181,7 +180,27 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
}
} else {
#if defined(ARMV17)
#if __ARM_NEON
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
......@@ -209,26 +228,10 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
[filter2] "r"(filter2), [filter3] "r"(filter3),
[output_ptr] "r"(output_ptr), [zero] "r"(zero)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6");
#endif // __aarch64__
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
const float32x4_t v_filter1 = vld1q_f32(filter1);
const float32x4_t v_filter2 = vld1q_f32(filter2);
const float32x4_t v_filter3 = vld1q_f32(filter3);
float32x4_t mula = vmulq_f32(data1, v_filter1);
mula = vmlaq_f32(mula, data2, v_filter2);
mula = vmlaq_f32(mula, data3, v_filter3);
float32x2_t res = vpadd_f32(
vget_high_f32(vsetq_lane_f32(0, mula, 3)), vget_low_f32(mula));
res = vpadd_f32(res, res);
if (if_bias) {
output_data[ph * output_width + pw] += vget_lane_f32(res, 0);
} else {
output_data[ph * output_width + pw] = vget_lane_f32(res, 0);
}
#endif
#endif // __ARM_NEON
}
}
}
......@@ -239,12 +242,11 @@ void DepthwiseConv3x3(const Tensor *input, vector<int> strides,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#endif
}
void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor *bias, bool if_bias) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -520,7 +522,7 @@ void DepthwiseConv3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -824,7 +826,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const int batch_size = input->dims()[0];
......@@ -1022,7 +1024,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const Tensor *input, const Tensor *filter,
void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, Tensor bias, bool if_bias) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......@@ -1225,7 +1227,7 @@ void DepthwiseConv3x3s2p1v2(const Tensor *input, const Tensor *filter,
void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) {
#ifdef __ARM_NEON
#if __ARM_NEON
const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>();
float *output_data = output->data<float>();
......
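In the __aarch64__ branch added above, the ARMv7 inline assembly is replaced by NEON intrinsics: three rows of input and filter taps are multiplied and accumulated into a single float32x4_t, lane 3 is zeroed (only three of the four lanes carry valid taps), and two pairwise adds collapse the vector to a scalar. A self-contained sketch of that horizontal reduction is shown below; the function name HorizontalSum3 is assumed for illustration only.

#if __ARM_NEON
#include <arm_neon.h>
// Hedged sketch: sum the first three lanes of a float32x4_t, the same
// reduction the depthwise 3x3 kernel performs after its multiply-accumulates.
static inline float HorizontalSum3(float32x4_t v) {
  v = vsetq_lane_f32(0.f, v, 3);  // lane 3 holds no valid data
  float32x2_t res = vpadd_f32(vget_high_f32(v), vget_low_f32(v));
  res = vpadd_f32(res, res);
  return vget_lane_f32(res, 0);
}
#endif  // __ARM_NEON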
(This diff is collapsed.)
......@@ -28,6 +28,7 @@ namespace paddle_mobile {
namespace operators {
namespace math {
/*
// Pack blocks of matrix A into contiguous memory (ColMajor)
void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
float *buffer);
......@@ -35,6 +36,7 @@ void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
// Pack blocks of matrix B into contiguous memory (ColMajor)
void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *buffer);
*/
// Pack blocks of matrix A into contiguous memory (RowMajor)
void PackMatrixA_(int m, int k, int m_tail, const float *A, int lda,
......@@ -51,7 +53,7 @@ void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
const float *b, float beta, float *c, float *C, int ldc,
bool relu, float *new_scale, float *new_bias);
/*
// Vector-matrix multiplication (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
......@@ -60,6 +62,7 @@ void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu, float *new_scale, float *new_bias);
*/
// Compute a smaller block of matrix C
void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
......@@ -81,6 +84,7 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale,
void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias);
/*
// Write back vector-matrix multiplication results
// C = A * B
void VecWriteBasic(int n, float *c, float *C, int ldc);
......@@ -96,6 +100,7 @@ void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
// C = A * B, batchnorm(C), relu(C)
void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
float *new_bias);
*/
// 32-bit float matrix multiplication
void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
......
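The packing declarations above copy panels of A and B into contiguous buffers so the inner GEMM kernel streams memory sequentially. The sketch below illustrates the general idea of row-major A packing with a 4-row micro-panel (suggested by AddDot4x4, but not confirmed as the library's panel size); it is a simplified illustration, not PackMatrixA_ itself, and it pads tail rows with zeros.

// Hedged sketch: pack A (RowMajor, row stride lda) into 4-row micro-panels.
// buffer must hold k * round_up(m, 4) floats; rows past m are zero-padded.
void PackA_RowMajor4(int m, int k, const float *A, int lda, float *buffer) {
  const int MR = 4;  // assumed micro-panel height
  for (int i = 0; i < m; i += MR) {
    for (int p = 0; p < k; ++p) {
      for (int r = 0; r < MR; ++r) {
        *buffer++ = (i + r < m) ? A[(i + r) * lda + p] : 0.f;
      }
    }
  }
}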
......@@ -15,7 +15,7 @@ limitations under the License. */
#include "operators/math/im2col.h"
#include <vector>
#ifdef __ARM_NEON
#include "arm_neon.h"
#include <arm_neon.h>
#endif
#include "common/types.h"
namespace paddle_mobile {
......@@ -69,7 +69,7 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>();
T *col_data = col->data<T>();
#ifdef __ARM_NEON
#if __ARM_NEON
const int osize = col_height;
const int isize = im_height;
bool pad1 = padding[0] > 0;
......
......@@ -50,7 +50,7 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
float alpha, framework::Tensor *matrix_out, float beta,
bool relu, framework::Tensor *new_scale,
framework::Tensor *new_bias) {
framework::Tensor *new_bias, int group) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -71,7 +71,8 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
SgemmWithBn(M, N, K, alpha, matrix_a.data<float>(), K, matrix_b.data<float>(),
N, beta, matrix_out->data<float>(), N, relu,
new_scale->data<float>(), new_bias->data<float>());
new_scale->data<float>() + group,
new_bias->data<float>() + group);
}
} // namespace math
......
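The new group argument above offsets the fused batch-norm parameters: SgemmWithBn now receives new_scale + group and new_bias + group, so a grouped convolution (for example a depthwise convolution run as one small GEMM per group) applies the scale and bias belonging to that group instead of always starting at element 0. A hedged sketch of the idea with plain arrays, using an invented helper name, is:

#include <cstddef>
// Hedged sketch: apply per-group scale/bias to a grouped GEMM output laid
// out as one block of n values per group (ApplyBnPerGroup is illustrative).
void ApplyBnPerGroup(float *C, size_t groups, size_t n,
                     const float *new_scale, const float *new_bias) {
  for (size_t g = 0; g < groups; ++g) {
    float *Cg = C + g * n;  // output block of group g
    for (size_t j = 0; j < n; ++j) {
      Cg[j] = Cg[j] * new_scale[g] + new_bias[g];  // parameters offset by g
    }
  }
}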
......@@ -31,7 +31,8 @@ template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu,
framework::Tensor *new_scale, framework::Tensor *new_bias);
framework::Tensor *new_scale, framework::Tensor *new_bias,
int group);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#include "pool_2x2.h"
#include "operators/math/pool_2x2.h"
#include <algorithm>
#include <vector>
namespace paddle_mobile {
namespace operators {
......@@ -21,10 +23,10 @@ namespace math {
void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#ifdef __ARM_NEON
#ifdef ARMV7
#if __ARM_NEON
#if __aarch64__
#else
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -93,15 +95,16 @@ void Pool2x2Max(vector<int> strides, vector<int> paddings, const Tensor *input,
output_data += output_batch_stride;
}
#endif
#else
#endif
}
void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output) {
#ifdef __ARM_NEON
#if __ARM_NEON
#ifdef ARMV7
#if __aarch64__
#else
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
......@@ -171,12 +174,9 @@ void Pool2x2Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
// TODO(): to imp other asm
#endif
#else
#endif
}
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "pool_3x3.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
......@@ -518,6 +518,8 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
input_data += input_batch_stride;
out_data += output_batch_stride;
}
#else
#endif
}
......@@ -582,7 +584,18 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if defined(ARMV7)
#if __aarch64__
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
......@@ -598,17 +611,6 @@ void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
[pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q1", "q2", "q3", "q4");
#else
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos1 + input_width);
const float32x4_t data3 = vld1q_f32(pos1 + 2 * input_width);
const float32x4_t max_data =
vmaxq_f32(vmaxq_f32(data1, data2), data3);
float32x2_t res =
vpmax_f32(vget_high_f32(vsetq_lane_f32(-INT_MAX, max_data, 3)),
vget_low_f32(max_data));
res = vpmax_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0);
#endif
}
}
......@@ -676,8 +678,8 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
}
output_seg[ph * output_width + pw] = sum / 9.0;
} else {
#if defined(ARMV7)
#if __aarch64__
#else
asm volatile(
"vld1.32 {q1}, [%[pos1]] \n\t"
"vld1.32 {q2}, [%[pos2]] \n\t"
......@@ -696,7 +698,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
[output_ptr] "r"(output_ptr), [zero] "r"(zero),
[nine_ptr] "r"(nine_ptr)
: "memory", "r6", "q1", "q2", "q3", "q4");
#else
#endif
const float32x4_t data1 = vld1q_f32(pos1);
const float32x4_t data2 = vld1q_f32(pos2);
const float32x4_t data3 = vld1q_f32(pos3);
......@@ -707,7 +709,6 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
vget_low_f32(sum_data));
res = vpadd_f32(res, res);
output_seg[ph * output_width + pw] = vget_lane_f32(res, 0) / 9.0;
#endif
}
}
}
......@@ -715,6 +716,7 @@ void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *input,
input_data += input_batch_stride;
output_data += output_batch_stride;
}
#else
#endif
}
} // namespace math
......
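The Pool3x3Max __aarch64__ branch mirrors the depthwise-conv change: vmaxq_f32 takes the elementwise maximum of three rows, lane 3 is forced to -INT_MAX so it cannot win, and two pairwise maxima reduce the vector to a scalar. A standalone sketch of that 3-lane horizontal max follows; the name HorizontalMax3 is assumed for illustration.

#if __ARM_NEON
#include <arm_neon.h>
#include <climits>
// Hedged sketch: maximum of the first three lanes of a float32x4_t,
// matching the reduction in the 3x3 max-pool intrinsics path.
static inline float HorizontalMax3(float32x4_t v) {
  v = vsetq_lane_f32(static_cast<float>(-INT_MAX), v, 3);  // lane 3 must not win
  float32x2_t res = vpmax_f32(vget_high_f32(v), vget_low_f32(v));
  res = vpmax_f32(res, res);
  return vget_lane_f32(res, 0);
}
#endif  // __ARM_NEON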
......@@ -135,6 +135,7 @@ class SoftmaxFuntor<CPU, T> {
}
}
}
#else
#endif // ARM_NEON
public:
......
......@@ -50,7 +50,7 @@ void MulOp<Dtype, T>::InferShape() const {
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
template class MulOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -34,7 +34,7 @@ void MultiClassNMSOp<Dtype, T>::InferShape() const {
// pre size, will change in Compute.
this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6}));
}
template class MultiClassNMSOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -371,7 +371,7 @@ class BatchNormParam : OpParam {
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
is_test_ = GetAttr<bool>("is_test", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *InputX() const { return input_x_; }
......@@ -1059,6 +1059,165 @@ class FusionConvAddBNReluParam : public OpParam {
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
class FusionDWConvBNReluParam : public OpParam {
public:
FusionDWConvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_ = OutFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *input_;
Tensor *output_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
Print &operator<<(Print &printer, const FusionConvAddParam &conv_param);
#endif
#ifdef FUSION_CONVBNRELU_OP
class FusionConvBNReluParam : public OpParam {
public:
FusionConvBNReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope) {
filter_ = FilterFrom<LoDTensor>(inputs, scope);
input_ = InputFrom<LoDTensor>(inputs, scope);
output_ = OutFrom<LoDTensor>(outputs, scope);
strides_ = GetAttr<vector<int>>("strides", attrs);
paddings_ = GetAttr<vector<int>>("paddings", attrs);
dilations_ = GetAttr<vector<int>>("dilations", attrs);
groups = GetAttr<int>("groups", attrs);
input_bias_ = InputBiasFrom<LoDTensor>(inputs, scope);
input_mean_ = InputMeanFrom<LoDTensor>(inputs, scope);
input_scale_ = InputScaleFrom<LoDTensor>(inputs, scope);
input_variance_ = InputVarianceFrom<LoDTensor>(inputs, scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
momentum_ = GetAttr<float>("momentum", attrs);
// is_test_ = GetAttr<bool>("is_test", attrs);
}
const Tensor *Input() const { return input_; }
const Tensor *Filter() const { return filter_; }
Tensor *Output() const { return output_; }
const vector<int> &Strides() const { return strides_; }
const vector<int> &Paddings() const { return paddings_; }
const vector<int> &Dilations() const { return dilations_; }
const int &Groups() const { return groups; }
const Tensor *InputBias() const { return input_bias_; }
const Tensor *InputMean() const { return input_mean_; }
const Tensor *InputScale() const { return input_scale_; }
const Tensor *InputVariance() const { return input_variance_; }
const float &Epsilon() const { return epsilon_; }
const float &Momentum() const { return momentum_; }
const bool &IsTest() const { return is_test_; }
void SetNewScale(Tensor *new_scale) { new_scale_ = new_scale; }
void SetNewBias(Tensor *new_bias) { new_bias_ = new_bias; }
const Tensor *NewScale() const { return new_scale_; }
const Tensor *NewBias() const { return new_bias_; }
protected:
Tensor *input_;
Tensor *output_;
Tensor *filter_;
vector<int> strides_;
vector<int> paddings_;
vector<int> dilations_;
int groups;
Tensor *input_bias_;
Tensor *input_mean_;
Tensor *input_scale_;
Tensor *input_variance_;
float epsilon_;
float momentum_;
bool is_test_;
Tensor *new_bias_;
Tensor *new_scale_;
};
#endif
#ifdef IM2SEQUENCE_OP
class Im2SequenceParam : public OpParam {
public:
......
......@@ -54,7 +54,7 @@ void PoolOp<DeviceType, T>::InferShape() const {
}
this->param_.Output()->Resize(framework::make_ddim(output_shape));
}
template class PoolOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ void PReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class PReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -44,7 +44,7 @@ void PriorBoxOp<Dtype, T>::InferShape() const {
this->param_.OutputBoxes()->Resize(framework::make_ddim(dim_vec));
this->param_.OutputVariances()->Resize(framework::make_ddim(dim_vec));
}
template class PriorBoxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ void ReluOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ReluOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,7 +27,7 @@ void ReshapeOp<Dtype, T>::InferShape() const {
auto out_dims = ValidateShape(shape, input_x_dims);
this->param_.Out()->Resize(out_dims);
}
template class ReshapeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ResizeOp<Dtype, T>::InferShape() const {
auto out_dims = CalOutputShape(this->param_);
this->param_.Out()->Resize(out_dims);
}
template class ResizeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,7 +24,7 @@ void ScaleOp<Dtype, T>::InferShape() const {
auto input_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(input_dims);
}
template class ScaleOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SigmoidOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -23,7 +23,7 @@ template <typename Dtype, typename T>
void SliceOp<Dtype, T>::InferShape() const {
/// todo: add InputShape() detection.
}
template class SliceOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -22,7 +22,7 @@ template <typename DeviceType, typename T>
void SoftmaxOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims());
}
template class SoftmaxOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -47,7 +47,7 @@ void TransposeOp<Dtype, T>::InferShape() const {
}
this->param_.Out()->Resize(out_dims);
}
template class TransposeOp<CPU, float>;
} // namespace operators
} // namespace paddle_mobile
......
......@@ -19,7 +19,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet
// ../../../test/models/mobilenet
auto program = loader.Load(g_googlenet, true);
// auto program = loader.Load(g_googlenet, true);
auto program = loader.Load(g_mobilenet_ssd, true);
// auto program = loader.Load(g_googlenet_combine + "/model",
// g_googlenet_combine +
// "/params", true);
......
......@@ -23,7 +23,7 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_googlenet, optimize)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
DLOG << "load cost: " << time_diff(time1, time1) << "ms";
std::vector<float> input;
std::vector<int64_t> dims{1, 3, 224, 224};
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
......
......@@ -12,16 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet_ssd, true)) {
auto isok = paddle_mobile.Load(g_mobilenet_ssd_gesture + "/model",
g_mobilenet_ssd_gesture + "/params", true);
// auto isok = paddle_mobile.Load(g_mobilenet_ssd, false);
if (isok) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 300, 300};
Tensor input_tensor;
......@@ -33,7 +37,8 @@ int main() {
auto time3 = time();
paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
return 0;
}
......@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
......@@ -22,7 +22,7 @@ int main() {
auto time1 = time();
if (paddle_mobile.Load(g_mobilenet, true)) {
auto time2 = time();
DLOG << "load cost :" << time_diff(time1, time1) << "ms";
std::cout << "load cost :" << time_diff(time1, time1) << "ms" << std::endl;
std::vector<int64_t> dims{1, 3, 224, 224};
Tensor input_tensor;
......@@ -35,7 +35,8 @@ int main() {
auto vec_result = paddle_mobile.Predict(input, dims);
auto time4 = time();
DLOG << "predict cost :" << time_diff(time3, time4) << "ms";
std::cout << "predict cost :" << time_diff(time3, time4) << "ms"
<< std::endl;
}
return 0;
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#include <fstream>
#include <random>
#include <string>
#include <vector>
#include "common/common.h"
#include "common/log.h"
......@@ -23,6 +25,8 @@ limitations under the License. */
#include "framework/tensor.h"
static const std::string g_mobilenet_ssd = "../models/mobilenet+ssd";
static const std::string g_mobilenet_ssd_gesture =
"../models/mobilenet+ssd_gesture";
static const std::string g_squeezenet = "../models/squeezenet";
static const std::string g_googlenet = "../models/googlenet";
static const std::string g_mobilenet = "../models/mobilenet";
......@@ -62,9 +66,9 @@ void GetInput(const std::string &input_name, std::vector<T> *input,
size *= dim;
}
T *input_ptr = (T *)malloc(sizeof(T) * size);
T *input_ptr = reinterpret_cast<T *>(malloc(sizeof(T) * size));
std::ifstream in(input_name, std::ios::in | std::ios::binary);
in.read((char *)(input_ptr), size * sizeof(T));
in.read(reinterpret_cast<char *>(input_ptr), size * sizeof(T));
in.close();
for (int i = 0; i < size; ++i) {
input->push_back(input_ptr[i]);
......@@ -79,6 +83,6 @@ void GetInput(const std::string &input_name,
T *input_ptr = input->mutable_data<T>(dims);
std::ifstream in(input_name, std::ios::in | std::ios::binary);
in.read((char *)(input_ptr), input->numel() * sizeof(T));
in.read(reinterpret_cast<char *>(input_ptr), input->numel() * sizeof(T));
in.close();
}
......@@ -38,7 +38,8 @@ build_for_android() {
fi
if [ -z "$PLATFORM" ]; then
PLATFORM="arm-v7a" # Users could choose "arm-v8a" or other platforms from the command line.
PLATFORM="arm-v7a" # Users could choose "arm-v8a" platform.
# PLATFORM="arm-v8a"
fi
if [ "${PLATFORM}" = "arm-v7a" ]; then
......@@ -92,23 +93,28 @@ build_for_ios() {
# rm -rf "../build"
PLATFORM="ios"
MODE="Release"
BUILD_DIR=../build/release/"${PLATFORM}"
# IOS_ARCH="armv7"
# IOS_ARCH="armv7s"
IOS_ARCH="arm64" # Users could choose "armv7" or "armv7s" platforms.
BUILD_DIR=../build/release/"${PLATFORM}"/"${IOS_ARCH}"
TOOLCHAIN_FILE="./tools/ios-cmake/ios.toolchain.cmake"
mkdir -p "${BUILD_DIR}"
if [ $# -eq 1 ]; then
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="${IOS_ARCH}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DNET=$1 \
-DIS_IOS="true"
else
cmake .. \
-B"${BUILD_DIR}" \
-DCMAKE_BUILD_TYPE="${MODE}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="${IOS_ARCH}" \
-DCMAKE_TOOLCHAIN_FILE="${TOOLCHAIN_FILE}" \
-DIS_IOS="true"
fi
cd "${BUILD_DIR}"
......
......@@ -159,7 +159,6 @@ set (CMAKE_OSX_SYSROOT ${CMAKE_IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS su
# set the architecture for iOS
if (${IOS_PLATFORM} STREQUAL "OS")
set (IOS_ARCH armv7 armv7s arm64)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR")
set (IOS_ARCH i386)
elseif (${IOS_PLATFORM} STREQUAL "SIMULATOR64")
......
......@@ -42,6 +42,16 @@ elseif (NET STREQUAL "resnet")
set(MUL_OP ON)
set(POOL_OP ON)
set(RELU_OP ON)
elseif (NET STREQUAL "FPGAnets")
set(FUSION_CONVRELU_OP ON)
set(FUSION_CONVBNSCALE_OP ON)
set(FUSION_CONVBNSCALERELU_OP ON)
set(FUSION_POOLBN_OP ON)
set(FUSION_ELEMENTWISEADDRELU_OP ON)
set(REGION_OP ON)
set(POOL_OP ON)
set(CONCAT_OP ON)
set(SOFTMAX_OP ON)
else ()
set(BATCHNORM_OP ON)
set(BOXCODER_OP ON)
......@@ -64,6 +74,8 @@ else ()
set(TRANSPOSE_OP ON)
set(FUSION_CONVADD_RELU_OP ON)
set(FUSION_CONVADDBNRELU_OP ON)
set(FUSION_DWCONVBNRELU_OP ON)
set(FUSION_CONVBNRELU_OP ON)
set(PRELU_OP ON)
set(RESIZE_OP ON)
set(SCALE_OP ON)
......@@ -155,6 +167,14 @@ endif()
if (FUSION_CONVADDBNRELU_OP)
add_definitions(-DFUSION_CONVADDBNRELU_OP)
endif()
if (FUSION_DWCONVBNRELU_OP)
add_definitions(-DFUSION_DWCONVBNRELU_OP)
endif()
if (FUSION_CONVBNRELU_OP)
add_definitions(-DFUSION_CONVBNRELU_OP)
endif()
if (PRELU_OP)
add_definitions(-DPRELU_OP)
endif()
......@@ -173,3 +193,23 @@ endif()
if (IM2SEQUENCE_OP)
add_definitions(-DIM2SEQUENCE_OP)
endif()
if (FUSION_CONVRELU_OP)
add_definitions(-DFUSION_CONVRELU_OP)
endif()
if (FUSION_CONVBNSCALE_OP)
add_definitions(-DFUSION_CONVBNSCALE_OP)
endif()
if (FUSION_CONVBNSCALERELU_OP)
add_definitions(-DFUSION_CONVBNSCALERELU_OP)
endif()
if (FUSION_POOLBN_OP)
add_definitions(-DFUSION_POOLBN_OP)
endif()
if (FUSION_ELEMENTWISEADDRELU_OP)
add_definitions(-DFUSION_ELEMENTWISEADDRELU_OP)
endif()
if (REGION_OP)
add_definitions(-DREGION_OP)
endif()
set(dir ${CMAKE_CURRENT_SOURCE_DIR})
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${dir}/build")
ADD_EXECUTABLE(convert convert.cpp)
target_link_libraries(convert paddle-mobile)
\ No newline at end of file
#include "io/paddle_mobile.h"
#include <cstdlib>
using std::string;
static const std::string g_googlenet_combine = "../models/googlenet_combine";
static const std::string g_googlenet = "../models/googlenet";
using paddle_mobile::Executor;
using paddle_mobile::framework::Program;
char *Get_binary_data(std::string filename) {
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
DLOG << "Get_binary_data end";
fclose(file);
return data;
}
void LoadWithDump(const paddle_mobile::framework::VarDesc var_desc,
paddle_mobile::framework::LoDTensor *tensor, char **data, FILE *out_file) {
// 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data);
// write version
fwrite(&version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 2 Lod information
uint64_t *lod_level_ptr = new uint64_t();
memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
uint64_t lod_level = 0;
// write lod Information
fwrite(&lod_level, sizeof(uint64_t), 1, out_file);
delete lod_level_ptr;
(*data) += sizeof(uint64_t);
auto &lod = *tensor->mutable_lod();
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
// write lod size
fwrite(&size, sizeof(uint64_t), 1, out_file);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
// write lod size vector
fwrite(tmp.data(), sizeof(size_t), tmp.size(), out_file);
lod[i] = tmp;
}
// 3. tensor version
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
// write tensor version
fwrite(&tensor_version, sizeof(uint32_t), 1, out_file);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
// write tensor desc
fwrite(&size, sizeof(int32_t), 1, out_file);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
}
fwrite(buf.get(), sizeof(char), size, out_file);
(*data) += (sizeof(char) * size);
const paddle_mobile::framework::TensorDesc &desc = var_desc.Tensor_desc();
int memory_size = 1;
for (auto l : desc.Dims()) {
memory_size *= l;
}
tensor->Resize(paddle_mobile::framework::make_ddim(desc.Dims()));
void *memory = tensor;
int type_size = 0;
switch (desc.DataType()) {
case paddle_mobile::framework::VARTYPE_TYPE_FP16:
type_size = 2;
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP32:
type_size = 4;
memory = tensor->mutable_data<float>();
break;
case paddle_mobile::framework::VARTYPE_TYPE_FP64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT32:
type_size = 4;
break;
case paddle_mobile::framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case paddle_mobile::framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break;
default:
break;
}
for (int n = 0; n < memory_size * type_size; ++n) {
static_cast<char *>(memory)[n] = (*data)[n];
}
(*data) += (sizeof(char) * memory_size * type_size);
// for float 32
float min_value = std::numeric_limits<float>::max();
float max_value = std::numeric_limits<float>::lowest();  // lowest(), not min(): values may be negative
for (int k = 0; k < memory_size; ++k) {
min_value = std::min(min_value, static_cast<float *>(memory)[k]);
max_value = std::max(max_value, static_cast<float *>(memory)[k]);
}
fwrite(&min_value, sizeof(float), 1, out_file);
fwrite(&max_value, sizeof(float), 1, out_file);
for (int g = 0; g < memory_size; ++g) {
float value = static_cast<float *>(memory)[g];
uint8_t factor = static_cast<uint8_t>(std::round((value - min_value) / (max_value - min_value) * 255));
fwrite(&factor, sizeof(uint8_t), 1, out_file);
}
}
void quantificate_combined(std::string model_path, std::string param_path, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32> loader;
bool optimize = true;
auto program = loader.Load(model_path, param_path, optimize);
char *origin_data = Get_binary_data(program.para_path);
char *data = origin_data;
FILE *out_file = fopen(param_min_path.c_str(), "wb");
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
LoadWithDump(*var_desc, tensor, &data, out_file);
}
}
}
fclose(out_file);
delete[] origin_data;  // allocated with new char[] in Get_binary_data
}
void quantificate_seperated(std::string model_dir, std::string param_min_path) {
paddle_mobile::Loader<paddle_mobile::CPU, paddle_mobile::Precision::FP32> loader;
bool optimize = true;
auto program = loader.Load(model_dir, optimize);
std::string shell_command = "mkdir " + param_min_path;
system(shell_command.c_str());
for (const auto &block : program.originProgram->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program.scope->Var(var_desc->Name());
if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<paddle_mobile::framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
std::string file_name = param_min_path + "/" + var_desc->Name();
FILE *out_file = fopen(file_name.c_str(), "wb");
char *origin_data =
Get_binary_data(program.model_path + "/" + var_desc->Name());
char *data = origin_data;
LoadWithDump(*var_desc, tensor, &data, out_file);
delete[] origin_data;  // allocated with new char[] in Get_binary_data
fclose(out_file);
}
}
}
}
int main() {
std::string filename = "params_min";
std::string model_path = g_googlenet_combine + "/model";
std::string param_path = g_googlenet_combine + "/params";
std::string dirname = "param_min_dir";
std::string model_dir = g_googlenet;
// quantificate_combined(model_path, param_path,filename);
quantificate_seperated(model_dir, dirname);
return 0;
}
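For reference, the dump written above stores each float tensor as a per-tensor [min, max] pair followed by one uint8 per element, with factor = round((value - min) / (max - min) * 255). Recovering an approximate float is the inverse mapping; the sketch below shows that dequantization under the assumption that min, max and the uint8 payload are read back in the same order (DequantizeTensor is an illustrative name, not part of the tool).

#include <cstdint>
#include <vector>
// Hedged sketch: invert the 8-bit min/max quantization written by LoadWithDump.
// value ~= min + factor / 255 * (max - min)
std::vector<float> DequantizeTensor(const std::vector<uint8_t> &factors,
                                    float min_value, float max_value) {
  std::vector<float> values;
  values.reserve(factors.size());
  const float range = max_value - min_value;
  for (uint8_t f : factors) {
    values.push_back(min_value + static_cast<float>(f) / 255.f * range);
  }
  return values;
}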