提交 abc1eaf3 编写于 作者: E eclipsess

conflict

...@@ -29,7 +29,10 @@ if(DEBUGING) ...@@ -29,7 +29,10 @@ if(DEBUGING)
message(STATUS "debugging mode") message(STATUS "debugging mode")
add_definitions(-DPADDLE_MOBILE_DEBUG) add_definitions(-DPADDLE_MOBILE_DEBUG)
else() else()
if(FPGA)
else()
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif()
endif() endif()
if(USE_EXCEPTION) if(USE_EXCEPTION)
...@@ -93,8 +96,7 @@ else() ...@@ -93,8 +96,7 @@ else()
endif() endif()
if(FPGA) if(FPGA)
set(DEBUGING ON) message("FPGA mode enabled")
add_definitions(-DPADDLE_MOBILE_DEBUG)
add_definitions(-DPADDLE_MOBILE_FPGA) add_definitions(-DPADDLE_MOBILE_FPGA)
else() else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc)
...@@ -177,6 +179,10 @@ if(DEBUGING) ...@@ -177,6 +179,10 @@ if(DEBUGING)
else() else()
add_subdirectory(test) add_subdirectory(test)
endif() endif()
elseif(FPGA)
add_subdirectory(test)
endif() endif()
...@@ -83,7 +83,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ...@@ -83,7 +83,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平
- **FPGA** - **FPGA**
FPGA实现正在进行中,是基于Xilinx的ZU5目标开发板。 目前已经支持 ZCU102 开发板。
- **灵活性** - **灵活性**
...@@ -112,6 +112,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ...@@ -112,6 +112,7 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平
开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。 开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) * [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
### 贡献文档 ### 贡献文档
- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
......
# FPGA开发文档
FPGA平台的代码在Xilinx ZCU102 revision 1.0开发板测试Resnet50成功,预测结果正确。
## 准备硬件
___
1. 购买Xilinx ZCU102 revision 1.0 开发板
2. 下载Xilinx ZCU102 Ubuntu[镜像文件](https://www.xilinx.com/member/forms/download/xef.html?filename=Ubuntu_Desktop_Release_2018_1.zip),并烧录进SD卡。
   * Windows系统可使用Win32DiskImager
   * Linux系统使用dd命令:dd if=name.img of=/dev/sdb
3. 将SD卡插入电脑,替换分区1中已有的BOOT.BIN、image.ub为[BOOT.BIN、image.ub](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
4. 将SD卡插入ZCU102开发板,设置开发板拨码开关为SD卡启动,上电启动Linux系统。
5. 装载驱动:sudo insmod [fpgadrv.ko](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)
## 编译工程
___
1. 将最新的paddle mobile 代码复制到ZCU102开发板中。
2. 进入paddle-mobile根目录, CMakeLists.txt 设置平台为 option(FPGA "fpga support" ON)。CPU和MALI\_GPU选项设置为OFF。
3. 执行以下命令,可在./test/build下生成test-resnet50可执行程序。
* mkdir build
* cd build
* cmake ..
* make
## 准备模型和数据
___
1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。
2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。
3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse".
## 运行程序
___
1. 进入./test/build目录。
2. sudo ./test-resnet50
3. 如果DEBUG选项打开,屏幕会输出很多中间打印信息,最终打印出预测分类结果为80。
...@@ -22,6 +22,7 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm"; ...@@ -22,6 +22,7 @@ const char *G_OP_TYPE_BATCHNORM = "batch_norm";
const char *G_OP_TYPE_BOX_CODER = "box_coder"; const char *G_OP_TYPE_BOX_CODER = "box_coder";
const char *G_OP_TYPE_CONCAT = "concat"; const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add"; const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu"; const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu"; const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu"; const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
...@@ -34,6 +35,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add"; ...@@ -34,6 +35,7 @@ const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn"; const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul"; const char *G_OP_TYPE_MUL = "mul";
const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms"; const char *G_OP_TYPE_MULTICLASS_NMS = "multiclass_nms";
const char *G_OP_TYPE_POLYGON_BOX_TRANSFORM = "polygon_box_transform";
const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_POOL2D = "pool2d";
const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box";
const char *G_OP_TYPE_RELU = "relu"; const char *G_OP_TYPE_RELU = "relu";
...@@ -94,9 +96,11 @@ std::unordered_map< ...@@ -94,9 +96,11 @@ std::unordered_map<
{G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_BN_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}}, {G_OP_TYPE_PRIOR_BOX, {{"Image", "Input"}, {"Boxes", "Variances"}}},
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}}, {G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
......
...@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cstdlib> #pragma once
#include <cstdlib>
#include <cstring>
#include <string>
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h" #include "common/log.h"
#pragma once
namespace paddle_mobile { namespace paddle_mobile {
template <int ID, typename Type> template <int ID, typename Type>
struct IDToType { struct IDToType {
typedef Type type_t; typedef Type type_t;
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "fpga/filter.h" #include "fpga/filter.h"
#include "fpga/image.h" #include "fpga/image.h"
#define FPGA_TEST_MODE #define FPGA_TEST_MODE
//#define PADDLE_MOBILE_OS_LINUX #define PADDLE_MOBILE_OS_LINUX
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -149,7 +149,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -149,7 +149,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
return do_ioctl(IOCTL_CONFIG_CONV, &args); return do_ioctl(IOCTL_CONFIG_CONV, &args);
} }
int ComputeFpgaConv(const struct WrapperConvArgs &args) { int ComputeFpgaConv(const struct SplitConvArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFPGAConv==========="; DLOG << "=============ComputeFPGAConv===========";
DLOG << " filter_num:" << args.filter_num DLOG << " filter_num:" << args.filter_num
...@@ -194,8 +194,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -194,8 +194,8 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaEWAdd==========="; DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled DLOG << " relu_enabled:" << args.relu_enabled
<< " const0:" << fp16_2_fp32(short(args.const0)) << " const0:" << fp16_2_fp32(int16_t(args.const0))
<< " const1:" << fp16_2_fp32(short(args.const1)); << " const1:" << fp16_2_fp32(int16_t(args.const1));
DLOG << " image0_address:" << args.image0.address DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address << " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels << " image0_channels:" << args.image0.channels
...@@ -383,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width, ...@@ -383,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out->reset_data_ptr(data_ptr); out->reset_data_ptr(data_ptr);
} }
void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, int stride_w, bool relu_enabled, int group_num, int stride_h,
int padding_h, int padding_w, float *bs_ptr) { int stride_w, int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>(); auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>(); auto out_ptr = out->data<float>();
......
...@@ -89,7 +89,7 @@ struct ConcatArgs { ...@@ -89,7 +89,7 @@ struct ConcatArgs {
uint32_t width; uint32_t width;
}; };
struct WrapperConvArgs { struct SplitConvArgs {
uint32_t split_num; uint32_t split_num;
uint32_t group_num; uint32_t group_num;
uint32_t filter_num; uint32_t filter_num;
...@@ -98,6 +98,14 @@ struct WrapperConvArgs { ...@@ -98,6 +98,14 @@ struct WrapperConvArgs {
struct ConcatArgs concat_arg; struct ConcatArgs concat_arg;
}; };
// Arguments for a grouped convolution on the FPGA: each group is executed as
// one SplitConvArgs and the per-group outputs are joined via concat_arg.
// NOTE(review): conv_args looks like a caller-managed array of group_num
// entries — ownership/lifetime is not documented here; confirm at call sites.
struct GroupConvArgs {
  uint32_t group_num;             // number of convolution groups
  uint32_t filter_num;            // total number of filters across groups
  struct ImageOutputArgs output;  // final (concatenated) output image
  struct SplitConvArgs* conv_args;  // per-group convolution arguments
  struct ConcatArgs concat_arg;   // how to concatenate group outputs
};
struct PoolingArgs { struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg int16_t mode; // mode: 0:max, 1:avg
half kernel_reciprocal; half kernel_reciprocal;
...@@ -159,30 +167,6 @@ struct MemoryCacheArgs { ...@@ -159,30 +167,6 @@ struct MemoryCacheArgs {
#define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs)
#define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs) #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 29, struct FpgaRegWriteArgs)
// Error/status codes shared with the FPGA kernel driver; negative values
// are returned by the driver's ioctl interface to signal failures.
enum FPGA_ERR_TYPE {
  ERR_IOCTL_CMD = -1,
  ERR_TIMEOUT = -2,
  ERR_COMPLETION_TIMEOUT = -3,
  ERR_INVALID_FPGA_ADDR = -4,
  ERR_NOMEM = -5,
  ERR_NO_RESERVE_MEM = -6,
  ERR_COPY_FROM_USER = -7,
  ERR_COPY_TO_USER = -8,
  ERR_DEL_TIMER = -9,
  ERR_ENABLE_MSI = -10,
  ERR_REGISTER_IRQ = -11,
  ERR_PCIE_REGISTER = -12,
  ERR_PCIE_PROBE = -13,
  ERR_REGISTER_BLOCK = -14,
  ERR_ALLOC_GENDISK = -15,
  ERR_INIT_QUEUE = -16,
  ERR_WAIT = -17,
  ERR_ECC_ERROR = -31,
  ERR_FPGA_FAIL_STOP = -64,
  ERR_FPGA_DEBUG_STOP = -113,
  DEV_TMP_UNAVAILABLE = -128  // device temporarily unavailable (not an error)
};
//============================== API ============================= //============================== API =============================
int open_device(); int open_device();
...@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size); ...@@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size); int fpga_invalidate(void* address, size_t size);
int PerformBypass(const struct BypassArgs& args); int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct WrapperConvArgs& args); int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args);
...@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array, ...@@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array,
void format_concat_output(framework::Tensor* out, int height, int width, void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num); int image_num, uint32_t* channel_num);
void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input, void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter, framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h, int stride_w, bool relu_enabled, int group_num, int stride_h,
int padding_h, int padding_w, float* bs_ptr); int stride_w, int padding_h, int padding_w, float* bs_ptr);
half fp32_2_fp16(float fp32_num); half fp32_2_fp16(float fp32_num);
float fp16_2_fp32(half fp16_num); float fp16_2_fp32(half fp16_num);
......
...@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -27,9 +27,6 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment = int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
if (num_per_div_before_alignment == num_per_div_after_alignment) {
return;
}
int num_element = int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = float *ptr_aligned =
......
...@@ -21,7 +21,10 @@ namespace paddle_mobile { ...@@ -21,7 +21,10 @@ namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
// Compute how many filters of size chw (channel*height*width) fit into one
// hardware division. chw is rounded up to a multiple of 16 (the hardware
// word granularity); the result is capped at the hardware limit of 2048.
int calc_division_capacity(int chw) {
  const int rounded_words = (chw + 15) / 16;
  int capacity = 2048 / rounded_words * 32;
  if (capacity > 2048) {
    capacity = 2048;
  }
  return capacity;
}
int calc_split_num(int num, int division_capacity) { int calc_split_num(int num, int division_capacity) {
return (num + division_capacity - 1) / division_capacity; return (num + division_capacity - 1) / division_capacity;
......
...@@ -156,7 +156,7 @@ class AttrReader { ...@@ -156,7 +156,7 @@ class AttrReader {
template <typename T> template <typename T>
inline T Get(const string &name) const { inline T Get(const string &name) const {
PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0, PADDLE_MOBILE_ENFORCE(attrs_.count(name) != 0,
"%s should be in AttributeMap", name); "%s should be in AttributeMap", name.c_str());
return ((Attribute)attrs_.at(name)).Get<T>(); return ((Attribute)attrs_.at(name)).Get<T>();
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "framework/data_type.h"
#include <stdint.h>
#include <string>
#include <unordered_map>
namespace paddle_mobile {
namespace framework {
// Bidirectional lookup tables between C++ runtime types and the protobuf
// VarType enum used in serialized model files. Populated once by
// InitDataTypeMap via RegisterType.
struct DataTypeMap {
  // C++ type -> proto enum value
  std::unordered_map<std::type_index,
                     _PaddleMobile__Framework__Proto__VarType__Type>
      cpp_to_proto_;
  // proto enum value (as int) -> C++ type
  std::unordered_map<int, std::type_index> proto_to_cpp_;
  // proto enum value (as int) -> readable type name (e.g. "float")
  std::unordered_map<int, std::string> proto_to_str_;
  // C++ type -> sizeof(type) in bytes
  std::unordered_map<std::type_index, size_t> cpp_to_size_;
};
// Forward declaration; the table is built below, after RegisterType.
static DataTypeMap* InitDataTypeMap();

// Returns the singleton type-mapping table.
// C++11 removes the need for manual locking: concurrent callers wait if the
// static local variable is still being initialized.
// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex
static DataTypeMap& gDataTypeMap() {
  static DataTypeMap* g_data_type_map_ = InitDataTypeMap();
  return *g_data_type_map_;
}
// Record the mapping between C++ type T and its proto enum value, under the
// given readable name, in all four lookup tables of the map.
template <typename T>
static inline void RegisterType(
    DataTypeMap* map, _PaddleMobile__Framework__Proto__VarType__Type proto_type,
    const std::string& name) {
  const int proto_key = static_cast<int>(proto_type);
  map->cpp_to_proto_.emplace(typeid(T), proto_type);
  map->proto_to_cpp_.emplace(proto_key, typeid(T));
  map->proto_to_str_.emplace(proto_key, name);
  map->cpp_to_size_.emplace(typeid(T), sizeof(T));
}
// Build the singleton type-mapping table; called exactly once from
// gDataTypeMap(). The RegType macro registers one C++ type under its
// stringified name (e.g. RegType(float, ...) -> name "float").
static DataTypeMap* InitDataTypeMap() {
  auto retv = new DataTypeMap();
#define RegType(cc_type, proto_type) \
  RegisterType<cc_type>(retv, proto_type, #cc_type)

  // NOTE: add custom types here.
  // RegType(float16, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16);
  RegType(float, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32);
  RegType(double, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64);
  RegType(int, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32);
  RegType(int64_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64);
  RegType(bool, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL);
  RegType(size_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T);
  RegType(int16_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16);
  RegType(uint8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8);
  RegType(int8_t, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8);
#undef RegType
  return retv;
}
// Map a C++ std::type_index to the proto VarType enum stored in model files.
// Throws (via PADDLE_MOBILE_THROW_EXCEPTION) if the type was never registered.
_PaddleMobile__Framework__Proto__VarType__Type ToDataType(
    std::type_index type) {
  const auto& table = gDataTypeMap().cpp_to_proto_;
  const auto entry = table.find(type);
  if (entry != table.end()) {
    return entry->second;
  }
  PADDLE_MOBILE_THROW_EXCEPTION("Not support %s as tensor type", type.name());
}
// Reverse mapping: proto VarType enum -> C++ std::type_index.
// Throws if the enum value was never registered.
std::type_index ToTypeIndex(
    _PaddleMobile__Framework__Proto__VarType__Type type) {
  const auto& table = gDataTypeMap().proto_to_cpp_;
  const auto entry = table.find(static_cast<int>(type));
  if (entry != table.end()) {
    return entry->second;
  }
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as "
      "tensor type",
      static_cast<int>(type));
}
// Human-readable name for a proto VarType enum value (e.g. "float").
// Throws if the enum value was never registered.
std::string DataTypeToString(
    const _PaddleMobile__Framework__Proto__VarType__Type type) {
  const auto& table = gDataTypeMap().proto_to_str_;
  const auto entry = table.find(static_cast<int>(type));
  if (entry != table.end()) {
    return entry->second;
  }
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Not support _PaddleMobile__Framework__Proto__VarType__Type(%d) as "
      "tensor type",
      static_cast<int>(type));
}
} // namespace framework
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <typeindex>
#include "common/enforce.h"
#include "framework/framework.pb-c.h"
namespace paddle_mobile {
namespace framework {
extern _PaddleMobile__Framework__Proto__VarType__Type ToDataType(
std::type_index type);
extern std::type_index ToTypeIndex(
_PaddleMobile__Framework__Proto__VarType__Type type);
// Dispatch on a runtime dtype enum: invokes visitor.template apply<T>() with
// the C++ type T matching `type`. Throws for enum values with no case below.
template <typename Visitor>
inline void VisitDataType(_PaddleMobile__Framework__Proto__VarType__Type type,
                          Visitor visitor) {
  switch (type) {
    // FP16 support is not enabled yet (no float16 type registered).
    // case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16:
    //   visitor.template apply<float16>();
    //   break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32:
      visitor.template apply<float>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64:
      visitor.template apply<double>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT32:
      visitor.template apply<int>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT64:
      visitor.template apply<int64_t>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL:
      visitor.template apply<bool>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8:
      visitor.template apply<uint8_t>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT16:
      visitor.template apply<int16_t>();
      break;
    case PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8:
      visitor.template apply<int8_t>();
      break;
    default:
      PADDLE_MOBILE_THROW_EXCEPTION("Not supported %d", type);
  }
}
extern std::string DataTypeToString(
const _PaddleMobile__Framework__Proto__VarType__Type type);
// Stream a proto VarType enum as its registered human-readable name.
inline std::ostream& operator<<(
    std::ostream& out,
    const _PaddleMobile__Framework__Proto__VarType__Type& type) {
  return out << DataTypeToString(type);
}
} // namespace framework
} // namespace paddle_mobile
...@@ -64,6 +64,9 @@ limitations under the License. */ ...@@ -64,6 +64,9 @@ limitations under the License. */
// load requared ops // load requared ops
LOAD_OP(feed) LOAD_OP(feed)
LOAD_OP(fetch) LOAD_OP(fetch)
#ifdef FILL_CONSTANT_OP
LOAD_OP(fill_constant)
#endif
#ifdef BATCHNORM_OP #ifdef BATCHNORM_OP
LOAD_OP2(batch_norm, CPU, MALI_GPU); LOAD_OP2(batch_norm, CPU, MALI_GPU);
#endif #endif
...@@ -199,6 +202,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA); ...@@ -199,6 +202,9 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
#ifdef MULTICLASSNMS_OP #ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU); LOAD_OP1(multiclass_nms, CPU);
#endif #endif
#ifdef POLYGONBOXTRANSFORM_OP
LOAD_OP1(polygon_box_transform, CPU);
#endif
#ifdef SUM_OP #ifdef SUM_OP
LOAD_OP1(sum, CPU); LOAD_OP1(sum, CPU);
#endif #endif
......
...@@ -32,7 +32,7 @@ template <typename Dtype> ...@@ -32,7 +32,7 @@ template <typename Dtype>
vector<string> OperatorBase<Dtype>::GetInputKeys() const { vector<string> OperatorBase<Dtype>::GetInputKeys() const {
auto it = op_input_output_key.find(type_); auto it = op_input_output_key.find(type_);
if (it == op_input_output_key.end()) { if (it == op_input_output_key.end()) {
DLOG << type_ << " has no outputs"; DLOG << type_ << " has no inputs";
return {}; return {};
} }
return it->second.first; return it->second.first;
......
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/mixed_vector.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "memory/t_malloc.h" #include "memory/t_malloc.h"
#include "mixed_vector.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace framework { namespace framework {
......
...@@ -338,10 +338,14 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { ...@@ -338,10 +338,14 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
for (int i = 0; i < tensor.numel(); i += stride) { for (int i = 0; i < tensor.numel(); i += stride) {
if (tensor.type() == typeid(float)) { if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " "; printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) { } else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " "; printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) { } else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " "; printer << static_cast<int>(tensor.data<int8_t>()[i]) << " ";
} else if (tensor.type() == typeid(int32_t)) {
printer << tensor.data<int32_t>()[i] << " ";
} }
} }
#endif #endif
......
...@@ -29,7 +29,14 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor( ...@@ -29,7 +29,14 @@ PaddleMobilePredictor<Dtype, P>::PaddleMobilePredictor(
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) { bool PaddleMobilePredictor<Dtype, P>::Init(const PaddleMobileConfig &config) {
paddle_mobile_.reset(new PaddleMobile<Dtype, P>()); paddle_mobile_.reset(new PaddleMobile<Dtype, P>());
if (!config.model_dir.empty()) {
if (config.memory_pack.from_memory) {
DLOG << "load from memory!";
paddle_mobile_->LoadCombinedMemory(config.memory_pack.model_size,
config.memory_pack.model_buf,
config.memory_pack.combined_params_size,
config.memory_pack.combined_params_buf);
} else if (!config.model_dir.empty()) {
paddle_mobile_->Load(config.model_dir, config.optimize, paddle_mobile_->Load(config.model_dir, config.optimize,
config.quantification, config.batch_size); config.quantification, config.batch_size);
} else if (!config.prog_file.empty() && !config.param_file.empty()) { } else if (!config.prog_file.empty() && !config.param_file.empty()) {
......
...@@ -80,12 +80,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -80,12 +80,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
} }
template <typename Dtype> template <typename Dtype>
void LoadMemInternal(void **data, framework::LoDTensor *tensor) { static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
bool quant_uint8 = false) {
char **data_buf = reinterpret_cast<char **>(data); char **data_buf = reinterpret_cast<char **>(data);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
Dtype *tensor_data = tensor->mutable_data<Dtype>(); Dtype *tensor_data = tensor->mutable_data<Dtype>();
if (0) { if (quant_uint8) {
// TODO(hjchen2) should be moved into operator init function // should be moved into operator init function
float min_value; float min_value;
float max_value; float max_value;
memcpy(&min_value, data_buf, sizeof(float)); memcpy(&min_value, data_buf, sizeof(float));
...@@ -141,7 +142,8 @@ void Executor<Dtype, P>::LoadMemory( ...@@ -141,7 +142,8 @@ void Executor<Dtype, P>::LoadMemory(
// parse tensor from stream // parse tensor from stream
switch (tensor_desc.DataType()) { switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32: case framework::VARTYPE_TYPE_FP32:
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor); LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
program_.quantification);
break; break;
case framework::VARTYPE_TYPE_INT8: case framework::VARTYPE_TYPE_INT8:
LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor); LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
......
...@@ -111,6 +111,14 @@ class PaddlePredictor { ...@@ -111,6 +111,14 @@ class PaddlePredictor {
PaddlePredictor() = default; PaddlePredictor() = default;
}; };
// Describes a model held entirely in memory (combined __model__ + params
// buffers) for loading through the predictor API without touching disk.
// NOTE(review): buffer ownership/lifetime is not documented here — the
// caller appears to retain ownership of both buffers; confirm at call sites.
struct PaddleModelMemoryPack {
  bool from_memory = false;                // true => load from these buffers
  size_t model_size = 0;                   // byte size of model_buf
  uint8_t* model_buf = nullptr;            // serialized program description
  size_t combined_params_size = 0;         // byte size of combined_params_buf
  uint8_t* combined_params_buf = nullptr;  // combined parameter data
};
struct PaddleMobileConfig : public PaddlePredictor::Config { struct PaddleMobileConfig : public PaddlePredictor::Config {
enum Precision { FP32 = 0 }; enum Precision { FP32 = 0 };
enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 }; enum Device { kCPU = 0, kFPGA = 1, kGPU_MALI = 2 };
...@@ -124,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config { ...@@ -124,6 +132,7 @@ struct PaddleMobileConfig : public PaddlePredictor::Config {
int thread_num = 1; int thread_num = 1;
std::string prog_file; std::string prog_file;
std::string param_file; std::string param_file;
struct PaddleModelMemoryPack memory_pack;
}; };
// A factory to help create different predictors. // A factory to help create different predictors.
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEQUANT_OP
#include "operators/dequantize_op.h" #include "operators/dequantize_op.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators; ...@@ -30,3 +32,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp); REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif #endif
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEQUANT_OP
#pragma once #pragma once
#include <string> #include <string>
...@@ -41,3 +43,5 @@ class DequantizeOp ...@@ -41,3 +43,5 @@ class DequantizeOp
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef ELEMENTWISEMUL_OP #ifdef ELEMENTWISEMUL_OP
#include "elementwise_mul_op.h" #include "operators/elementwise_mul_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Registration translation unit for the fill_constant operator.
// Compiled only when the op is enabled in the build (FILL_CONSTANT_OP).
#ifdef FILL_CONSTANT_OP

#include "operators/fill_constant_op.h"

namespace ops = paddle_mobile::operators;

// Register the same operator class for every enabled backend.
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fill_constant, ops::FillConstantOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fill_constant, ops::FillConstantOp);
#endif

#endif  // FILL_CONSTANT_OP
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FILL_CONSTANT_OP
#pragma once
#include <string>
#include "framework/data_type.h"
#include "framework/operator.h"
#include "framework/selected_rows.h"
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using std::string;
// fill_constant operator: fills its output with a scalar value.
// The output variable may hold a LoDTensor or a SelectedRows (in which
// case the rows' value tensor is filled); shape and dtype are taken from
// the op attributes via FillConstantParam.
template <typename DeviceType, typename T>
class FillConstantOp : public framework::OperatorBase<DeviceType> {
 public:
  FillConstantOp(const string &type, const VariableNameMap &inputs,
                 const VariableNameMap &outputs,
                 const framework::AttributeMap attrs,
                 std::shared_ptr<framework::Scope> scope)
      : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
                                            scope),
        param_(inputs, outputs, attrs, *scope) {}

  // Resolve the destination tensor, resize it to the attribute shape,
  // allocate storage of the requested dtype, then fill with Value().
  void RunImpl() const {
    const auto dtype =
        static_cast<_PaddleMobile__Framework__Proto__VarType__Type>(
            param_.DataDtype());
    const auto fill_value = param_.Value();
    auto *out_var = param_.OutVar();

    framework::Tensor *dest = nullptr;
    if (out_var->template IsType<framework::LoDTensor>()) {
      dest = out_var->template GetMutable<framework::LoDTensor>();
    } else if (out_var->template IsType<framework::SelectedRows>()) {
      dest = out_var->template GetMutable<framework::SelectedRows>()
                 ->mutable_value();
    } else {
      // Any other variable type is unsupported.
      PADDLE_MOBILE_THROW_EXCEPTION(
          "fill constant op's output only"
          "supports SelectedRows and LoDTensor");
    }

    dest->Resize(framework::make_ddim(param_.Shape()));
    dest->mutable_data(framework::ToTypeIndex(dtype));
    math::set_constant(dest, fill_value);
  }

  void Init() {}

  // Shape inference depends only on the "shape" attribute; no inputs
  // are consulted.
  void InferShape() const {
    PADDLE_MOBILE_ENFORCE(
        param_.Out() != nullptr,
        "Output (Out) of fill_constant op should not be null.");
    param_.Out()->Resize(framework::make_ddim(param_.Shape()));
  }

 protected:
  FillConstantParam<DeviceType> param_;
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_MOBILE_CPU #ifdef DEQUANT_OP
#include "operators/kernel/dequantize_kernel.h" #include "operators/kernel/dequantize_kernel.h"
...@@ -38,7 +38,8 @@ void DequantizeKernel<CPU, float>::Compute( ...@@ -38,7 +38,8 @@ void DequantizeKernel<CPU, float>::Compute(
const int32_t *x = input->data<const int32_t>(); const int32_t *x = input->data<const int32_t>();
float *y = output->mutable_data<float>(); float *y = output->mutable_data<float>();
size_t size = output->numel(); size_t size = output->numel();
float scale = 1.f / (activation_scale * weight_scale); // float scale = 1.f / (activation_scale * weight_scale);
float scale = activation_scale / weight_scale;
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4; size_t loop = size >> 4;
size_t remain = size & 0xF; size_t remain = size & 0xF;
......
...@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const { ...@@ -31,6 +31,8 @@ void MulKernel<CPU, float>::Compute(const MulParam<CPU> &param) const {
param.Out()->set_lod(param.InputX()->lod()); param.Out()->set_lod(param.InputX()->lod());
} }
template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -12,57 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,57 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef MUL_OP #ifdef POLYGONBOXTRANSFORM_OP
#include "operators/kernel/mul_kernel.h" #include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/kernel/central-arm-func/polygon_box_transform_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <> template <>
bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) { bool PolygonBoxTransformKernel<CPU, float>::Init(
bool relu_enabled = false; PolygonBoxTransformParam<CPU> *param) {
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto filter = const_cast<LoDTensor *>(param->InputY());
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = 0;
}
int num = (uint32_t)filter->dims()[1];
int chw = (uint32_t)filter->dims()[0];
PADDLE_MOBILE_ENFORCE(
chw == input_x->numel(),
"Filter element num should be equal to IFM element num");
int height = (uint32_t)input_x->dims()[2];
int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value);
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
0, bs_ptr);
param->SetFpgaArgs(conv_arg);
return true; return true;
} }
template <> template <>
void MulKernel<FPGA, float>::Compute(const MulParam<FPGA> &param) const { void PolygonBoxTransformKernel<CPU, float>::Compute(
fpga::ComputeFpgaConv(param.FpgaArgs()); const PolygonBoxTransformParam<CPU> &param) const {
PolygonBoxTransformCompute<float>(param);
} }
} // namespace operators } // namespace operators
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef PADDLE_MOBILE_CPU #ifdef QUANT_OP
#include "operators/kernel/quantize_kernel.h" #include "operators/kernel/quantize_kernel.h"
#include <cmath> #include <cmath>
...@@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale, ...@@ -225,7 +225,7 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
const float *x = input->data<const float>(); const float *x = input->data<const float>();
int8_t *y = output->mutable_data<int8_t>(); int8_t *y = output->mutable_data<int8_t>();
size_t size = input->numel(); size_t size = input->numel();
#ifdef defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
size_t loop = size >> 4; size_t loop = size >> 4;
size_t remain = size & 0xF; size_t remain = size & 0xF;
for (size_t i = 0; i < loop; ++i) { for (size_t i = 0; i < loop; ++i) {
...@@ -280,17 +280,18 @@ void QuantizeKernel<CPU, float>::Compute( ...@@ -280,17 +280,18 @@ void QuantizeKernel<CPU, float>::Compute(
} }
max_abs = std::max(max_abs, 1e-6f); max_abs = std::max(max_abs, 1e-6f);
// only support int8 currently // only support int8 currently
float online_scale = 127 / max_abs; float scale = 127 / max_abs;
param.online_scale_->mutable_data<float>()[0] = online_scale; param.online_scale_->mutable_data<float>()[0] = max_abs;
switch (param.round_type_) { switch (param.round_type_) {
case ROUND_NEAREST_TO_EVEN: case ROUND_NEAREST_TO_EVEN:
quantize_round_to_even(input, online_scale, output); quantize_round_to_even(input, scale, output);
break; break;
case ROUND_NEAREST_TOWARDS_ZERO: case ROUND_NEAREST_TOWARDS_ZERO:
quantize_round_to_zero(input, online_scale, output); quantize_round_to_zero(input, scale, output);
break; break;
case ROUND_NEAREST_AWAY_ZERO: case ROUND_NEAREST_AWAY_ZERO:
quantize_round_to_nearest(input, online_scale, output); quantize_round_to_nearest(input, scale, output);
break;
default: default:
LOG(kLOG_ERROR) << "round type is not supported."; LOG(kLOG_ERROR) << "round type is not supported.";
break; break;
......
...@@ -16,24 +16,27 @@ limitations under the License. */ ...@@ -16,24 +16,27 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "operators/math/conv_arm_int8.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv_3x3.h" #include "operators/math/depthwise_conv_3x3.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename Dtype>
inline void ConvBasic(const ConvParam<CPU> &param) { inline void ConvBasic(const ConvParam<CPU> &param) {
const Tensor *input = param.Input(); const Tensor *input = param.Input();
Tensor filter = *param.Filter(); Tensor filter = *param.Filter();
Tensor *output = param.Output(); Tensor *output = param.Output();
output->mutable_data<float>();
int groups = param.Groups(); int groups = param.Groups();
std::vector<int> strides = param.Strides(); const std::vector<int> strides = param.Strides();
std::vector<int> paddings = param.Paddings(); const std::vector<int> paddings = param.Paddings();
std::vector<int> dilations = param.Dilations(); const std::vector<int> dilations = param.Dilations();
const int batch_size = static_cast<int>(input->dims()[0]); const int batch_size = static_cast<int>(input->dims()[0]);
...@@ -57,7 +60,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -57,7 +60,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
Tensor col; Tensor col;
Tensor col_matrix; Tensor col_matrix;
if (is_expand) { if (is_expand) {
col.mutable_data<float>(col_shape); col.mutable_data<Dtype>(col_shape);
col_matrix.ShareDataWith(col); col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape); col_matrix.Resize(col_matrix_shape);
} }
...@@ -76,8 +79,8 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -76,8 +79,8 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
int in_step = static_cast<int>(input->dims()[1]) / groups; int in_step = static_cast<int>(input->dims()[1]) / groups;
int out_step = static_cast<int>(output->dims()[1]) / groups; int out_step = static_cast<int>(output->dims()[1]) / groups;
math::Vol2ColFunctor<CPU, float> vol2col; math::Vol2ColFunctor<CPU, Dtype> vol2col;
math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col; math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Dtype> im2col;
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
...@@ -96,6 +99,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -96,6 +99,7 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
std::vector<int>{paddings[0], paddings[1], paddings[0], std::vector<int>{paddings[0], paddings[1], paddings[0],
paddings[1]}, paddings[1]},
&col); &col);
} else if (data_dim == 3U) { } else if (data_dim == 3U) {
// vol2col // vol2col
vol2col(in_slice, dilations, strides, paddings, &col); vol2col(in_slice, dilations, strides, paddings, &col);
...@@ -104,15 +108,70 @@ inline void ConvBasic(const ConvParam<CPU> &param) { ...@@ -104,15 +108,70 @@ inline void ConvBasic(const ConvParam<CPU> &param) {
// gemm // gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<float>(filter_slice, false, col_matrix, false,
math::matmul<Dtype>(filter_slice, false, col_matrix, false,
static_cast<float>(1), &out_slice, static_cast<float>(1), &out_slice,
static_cast<float>(0)); static_cast<float>(0));
} }
} }
} }
// Int8 convolution entry point. Dispatches to a hand-written direct
// kernel when one exists for the (kernel size, stride) pair, otherwise
// falls back to the generic im2col + gemm path (ConvBasic<int8_t>).
// Fix: use nullptr (not the integer literal 0) for null function
// pointers in the dispatch table and the lookup result.
inline void ConvCompute_int8(const ConvParam<CPU> &param) {
  typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel,
                           Tensor *output);
  // Indexed by [kernel_size - 1][stride - 1]; nullptr means no direct
  // kernel is available for that combination.
  static ConvFunc conv_funcs_table[7][5] = {
      {nullptr, nullptr, nullptr, nullptr, nullptr},  // k = 1
      {nullptr, nullptr, nullptr, nullptr, nullptr},
      {conv3x3s1_int8, nullptr, nullptr, nullptr, nullptr},  // k = 3
      {nullptr, nullptr, nullptr, nullptr, nullptr},
      {conv5x5s1_int8, nullptr, nullptr, nullptr, nullptr},  // k = 5
      {nullptr, nullptr, nullptr, nullptr, nullptr},
      {nullptr, nullptr, nullptr, nullptr, nullptr},  // k = 7
  };
  const Tensor *input = param.Input();
  Tensor *filter = param.Filter();
  Tensor *output = param.Output();
  int groups = param.Groups();
  const std::vector<int> &strides = param.Strides();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &dilations = param.Dilations();
  int kernel_h = filter->dims()[2];
  int kernel_w = filter->dims()[3];
  // Int8 inputs accumulate into 32-bit outputs.
  output->mutable_data<int32_t>();

  // Direct kernels only cover square kernels (< 8), equal square
  // strides (< 6), a single group, and no dilation.
  ConvFunc conv_func = nullptr;
  if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w &&
      kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] &&
      dilations[1] == 1) {
    conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1];
  }
  if (conv_func != nullptr) {
    int batch_size = input->dims()[0];
    math::PadFunctor<CPU, int8_t> pad;

    Tensor input_pad;
    for (int i = 0; i < batch_size; ++i) {
      Tensor in_batch = input->Slice(i, i + 1);
      Tensor out_batch = output->Slice(i, i + 1);
      if (paddings[0] == 0 && paddings[1] == 0) {
        // No padding requested: feed the batch slice directly.
        input_pad = in_batch;
      } else {
        // Materialize a padded copy of the input first; the direct
        // kernels run over an already-padded input (padding applied by
        // PadFunctor — presumably zero padding; confirm in PadFunctor).
        framework::DDim pad_shape = in_batch.dims();
        pad_shape[2] += 2 * paddings[0];
        pad_shape[3] += 2 * paddings[1];
        input_pad.mutable_data<int8_t>(pad_shape);
        pad(in_batch, paddings[0], paddings[1], &input_pad);
      }
      conv_func(input_pad, *filter, &out_batch);
    }
  } else {
    ConvBasic<int8_t>(param);
  }
}
template <typename P> template <typename P>
void ConvCompute(const ConvParam<CPU> &param) { void ConvCompute(const ConvParam<CPU> &param) {
if (param.Input()->type() == typeid(int8_t)) {
ConvCompute_int8(param);
} else {
param.Output()->mutable_data<float>();
if (param.Groups() == param.Input()->dims()[1] && if (param.Groups() == param.Input()->dims()[1] &&
param.Input()->dims()[1] == param.Output()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] &&
param.Filter()->dims()[2] == param.Filter()->dims()[3] && param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
...@@ -126,7 +185,8 @@ void ConvCompute(const ConvParam<CPU> &param) { ...@@ -126,7 +185,8 @@ void ConvCompute(const ConvParam<CPU> &param) {
math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(),
param.Filter(), nullptr, param.Output(), false); param.Filter(), nullptr, param.Output(), false);
} else { } else {
ConvBasic(param); ConvBasic<float>(param);
}
} }
} }
......
...@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) { ...@@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam<CPU> &param) {
Bias, false); Bias, false);
} else { } else {
ConvBasic(param); ConvBasic<float>(param);
} }
} }
......
...@@ -15,8 +15,12 @@ limitations under the License. */ ...@@ -15,8 +15,12 @@ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#pragma once #pragma once
#include "operators/math/elementwise_op_function.h" #include "operators/math/elementwise_op_function.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -33,8 +37,61 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) { ...@@ -33,8 +37,61 @@ void ElementwiseAddCompute(const ElementwiseAddParam<CPU> &param) {
Tensor *Out = param.Out(); Tensor *Out = param.Out();
Out->mutable_data<float>(); Out->mutable_data<float>();
int axis = param.Axis(); int axis = param.Axis();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
const auto &x_dims = input_x->dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
size_t batch = 1;
size_t channels = 1;
size_t elementwise_num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
elementwise_num *= x_dims[i];
}
const float *bias_data = input_y->data<float>();
const float *input_data = input_x->data<float>();
float *output_data = Out->mutable_data<float>();
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float *bias = bias_data + j;
float *output = output_data + offset;
int loop = elementwise_num >> 0x4;
int remain = elementwise_num & 0xF;
for (int k = 0; k < loop; ++k) {
float32x4_t rb = vdupq_n_f32(*bias);
float32x4_t r0 = vld1q_f32(input);
float32x4_t r1 = vld1q_f32(input + 4);
float32x4_t r2 = vld1q_f32(input + 8);
float32x4_t r3 = vld1q_f32(input + 12);
r0 = vaddq_f32(r0, rb);
r1 = vaddq_f32(r1, rb);
r2 = vaddq_f32(r2, rb);
r3 = vaddq_f32(r3, rb);
vst1q_f32(output, r0);
vst1q_f32(output + 4, r1);
vst1q_f32(output + 8, r2);
vst1q_f32(output + 12, r3);
input += 16;
output += 16;
}
for (int k = 0; k < remain; ++k) {
output[k] = input[k] + *bias;
}
}
}
#else
ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis, ElementwiseComputeEx<AddFunctor<float>, float>(input_x, input_y, axis,
AddFunctor<float>(), Out); AddFunctor<float>(), Out);
#endif
} }
template class ElementwiseAddKernel<CPU, float>; template class ElementwiseAddKernel<CPU, float>;
......
...@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -58,7 +58,7 @@ void MulCompute(const MulParam<CPU> &param) {
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY(); const Tensor *input_y = param.InputY();
Tensor *out = param.Out(); Tensor *out = param.Out();
out->mutable_data<float>();
const Tensor x_matrix = const Tensor x_matrix =
input_x->dims().size() > 2 input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims()) ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
...@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) { ...@@ -71,15 +71,21 @@ void MulCompute(const MulParam<CPU> &param) {
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
} }
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
math::matmul<int8_t>(x_matrix, false, y_matrix, false,
static_cast<int8_t>(1), out, static_cast<int8_t>(0));
} else {
out->mutable_data<float>();
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(0)); out, static_cast<float>(0));
}
if (out_dim.size() != 2) { if (out_dim.size() != 2) {
out->Resize(out_dim); out->Resize(out_dim);
} }
} }
template class MulKernel<CPU, float>;
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Converts EAST-style geometry offsets into absolute polygon box
// coordinates: for each location, output = grid_coordinate * 4 - offset,
// where even channels use the x (width) coordinate and odd channels use
// the y (height) coordinate.
template <typename P>
void PolygonBoxTransformCompute(const PolygonBoxTransformParam<CPU>& param) {
  const auto* input = param.Input();
  const auto& in_dims = input->dims();
  const auto* in_data = input->data<float>();
  auto* output = param.Output();
  auto* out_data = output->mutable_data<float>();

  const int64_t batch = in_dims[0];
  const int64_t geo_channels = in_dims[1];
  const int64_t h = in_dims[2];
  const int64_t w = in_dims[3];

  // Iterate over (batch * channel) planes; within each plane walk the
  // spatial grid row by row.
  for (int64_t plane = 0; plane < batch * geo_channels; ++plane) {
    const bool x_channel = (plane % 2 == 0);
    for (int64_t y = 0; y < h; ++y) {
      for (int64_t x = 0; x < w; ++x) {
        const int64_t idx = (plane * h + y) * w + x;
        const int64_t grid = x_channel ? x : y;
        out_data[idx] = grid * 4 - in_data[idx];
      }
    }
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -17,6 +17,9 @@ limitations under the License. */ ...@@ -17,6 +17,9 @@ limitations under the License. */
#include <operators/math/transform.h> #include <operators/math/transform.h>
#include "operators/op_param.h" #include "operators/op_param.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -37,71 +40,100 @@ void ReluCompute(const ReluParam<CPU> &param) { ...@@ -37,71 +40,100 @@ void ReluCompute(const ReluParam<CPU> &param) {
auto *out_ptr = out->mutable_data<float>(); auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel(); int numel = input_x->numel();
// if (numel > 64) { #if defined(__ARM_NEON__) || defined(__ARM_NEON)
// asm volatile( #if __aarch64__
// "pld [%[input_x_ptr], #0] \n\t" if (numel > 0) {
// "vmov.f32 q8, #0.0 \n\t" int loop = numel >> 0x4;
// "subs %[num], %[num], #32 \n\t" int remain = numel & 0xF;
// "blt end_num_%= \n\t" float32x4_t zero = vdupq_n_f32(0.f);
// "loop_num_%=: \n\t" for (int i = 0; i < loop; ++i) {
// "pld [%[input_x_ptr], #1024] \n\t" float32x4_t r0 = vld1q_f32(input_x_ptr);
// float32x4_t r1 = vld1q_f32(input_x_ptr + 4);
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" float32x4_t r2 = vld1q_f32(input_x_ptr + 8);
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" float32x4_t r3 = vld1q_f32(input_x_ptr + 12);
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" r0 = vmaxq_f32(r0, zero);
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" r1 = vmaxq_f32(r1, zero);
// r2 = vmaxq_f32(r2, zero);
// "vmax.f32 q0, q0, q8 \n\t" r3 = vmaxq_f32(r3, zero);
// "vmax.f32 q1, q1, q8 \n\t" vst1q_f32(out_ptr, r0);
// "vmax.f32 q2, q2, q8 \n\t" vst1q_f32(out_ptr + 4, r1);
// "vmax.f32 q3, q3, q8 \n\t" vst1q_f32(out_ptr + 8, r2);
// "vmax.f32 q4, q4, q8 \n\t" vst1q_f32(out_ptr + 12, r3);
// "vmax.f32 q5, q5, q8 \n\t" input_x_ptr += 16;
// "vmax.f32 q6, q6, q8 \n\t" out_ptr += 16;
// "vmax.f32 q7, q7, q8 \n\t" }
// for (int i = 0; i < remain; ++i) {
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" out_ptr[i] = (input_x_ptr[i] > 0) * input_x_ptr[i];
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" }
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" #else
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" if (numel > 64) {
// asm volatile(
// "subs %[num], %[num], #32 \n\t" "pld [%[input_x_ptr], #0] \n\t"
// "bge loop_num_%= \n\t" "vmov.f32 q8, #0.0 \n\t"
// "end_num_%=: \n\t" "subs %[num], %[num], #32 \n\t"
// "cmp %[num], #0 \n\t" "blt end_num_%= \n\t"
// "bge end_%= \n\t" "loop_num_%=: \n\t"
// "mov r6, #4 \n\t" "pld [%[input_x_ptr], #1024] \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t" "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t" "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t" "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t" "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t" "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t" "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t" "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t" "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t" "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// : "subs %[num], %[num], #32 \n\t"
// : "bge loop_num_%= \n\t"
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "end_num_%=: \n\t"
// "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "cmp %[num], #0 \n\t"
// "q7", "q8", "r5", "bge end_%= \n\t"
// "r6"); "mov r6, #4 \n\t"
// } else { "mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t"
:
:
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
"r6");
#endif
} else {
#endif
ReluFunctor<float> func_; ReluFunctor<float> func_;
math::Transform trans; math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// } #if defined(__ARM_NEON__) || defined(__ARM_NEON)
}
#endif
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -15,11 +15,14 @@ limitations under the License. */ ...@@ -15,11 +15,14 @@ limitations under the License. */
#ifdef SUM_OP #ifdef SUM_OP
#pragma once #pragma once
#include <vector>
#include "operators/math/selected_rows_functor.h" #include "operators/math/selected_rows_functor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using LoDTensorArray = std::vector<LoDTensor>; using LoDTensorArray = std::vector<LoDTensor>;
template <typename P> template <typename P>
void SumCompute(const SumParam<CPU> &param) { void SumCompute(const SumParam<CPU> &param) {
auto inputsvars = param.InputsVars(); auto inputsvars = param.InputsVars();
...@@ -63,31 +66,21 @@ void SumCompute(const SumParam<CPU> &param) { ...@@ -63,31 +66,21 @@ void SumCompute(const SumParam<CPU> &param) {
std::unique_ptr<framework::SelectedRows> in0; std::unique_ptr<framework::SelectedRows> in0;
if (in_place) { if (in_place) {
// If is in_place, we store the input[0] to in0 // If is in_place, we store the input[0] to in0
auto *in_sel0 = inputsvars[0]->Get<SelectedRows>(); auto *in_sel0 = inputsvars[0]->Get<framework::SelectedRows>();
auto &rows = in_sel0->rows(); auto &rows = in_sel0->rows();
//#ifdef PADDLE_WITH_CUDA
// std::vector<int64_t> rows_in_cpu;
// rows_in_cpu.reserve(rows.size());
// for (auto item : rows) {
// rows_in_cpu.push_back(item);
// }
// in0.reset(new framework::SelectedRows(rows_in_cpu,
// in_sel0.height()));
//#else
in0.reset(new framework::SelectedRows(rows, in_sel0->height())); in0.reset(new framework::SelectedRows(rows, in_sel0->height()));
//#endif
in0->mutable_value()->ShareDataWith(in_sel0->value()); in0->mutable_value()->ShareDataWith(in_sel0->value());
} }
auto get_selected_row = [&](size_t i) -> const SelectedRows & { auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & {
if (i == 0 && in0) { if (i == 0 && in0) {
return *in0.get(); return *in0.get();
} else { } else {
return *(inputsvars[i]->Get<SelectedRows>()); return *(inputsvars[i]->Get<framework::SelectedRows>());
} }
}; };
auto *out = outvar->GetMutable<SelectedRows>(); auto *out = outvar->GetMutable<framework::SelectedRows>();
out->mutable_rows()->clear(); out->mutable_rows()->clear();
auto *out_value = out->mutable_value(); auto *out_value = out->mutable_value();
...@@ -150,8 +143,6 @@ void SumCompute(const SumParam<CPU> &param) { ...@@ -150,8 +143,6 @@ void SumCompute(const SumParam<CPU> &param) {
} }
} }
} else { } else {
if (outvar->IsType<framework::Tensor>()) {
}
PADDLE_MOBILE_THROW_EXCEPTION( PADDLE_MOBILE_THROW_EXCEPTION(
"Unexpected branch, output variable type is %s", outvar->Type().name()); "Unexpected branch, output variable type is %s", outvar->Type().name());
} }
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef DEQUANT_OP
#pragma once #pragma once
#include "framework/operator.h" #include "framework/operator.h"
...@@ -30,3 +32,5 @@ class DequantizeKernel ...@@ -30,3 +32,5 @@ class DequantizeKernel
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -23,8 +23,6 @@ limitations under the License. */ ...@@ -23,8 +23,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class ElementwiseMulKernel class ElementwiseMulKernel
: public framework::OpKernelBase<DeviceType, : public framework::OpKernelBase<DeviceType,
......
...@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { ...@@ -66,10 +66,11 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
......
...@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -65,10 +65,11 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -47,10 +47,11 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -59,10 +59,11 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -59,10 +59,11 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0], param->Strides()[1], param->Groups(), param->Strides()[0],
param->Paddings()[0], param->Paddings()[1], bs_ptr); param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -53,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -53,9 +53,9 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, bs_ptr); 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
...@@ -54,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -54,9 +54,9 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out); fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
0, bs_ptr); 0, 0, bs_ptr);
param->SetFpgaArgs(conv_arg); param->SetFpgaArgs(conv_arg);
return true; return true;
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Kernel declaration for the polygon_box_transform operator.
// DeviceType selects the backend (e.g. CPU); T is the element type.
// The actual Compute/Init bodies are defined in the per-backend .cpp files.
template <typename DeviceType, typename T>
class PolygonBoxTransformKernel
    : public framework::OpKernelBase<DeviceType,
                                     PolygonBoxTransformParam<DeviceType>> {
 public:
  // Runs the transform for one op invocation using the prepared param.
  void Compute(const PolygonBoxTransformParam<DeviceType>& param) const;
  // One-time setup; returns true on success.
  bool Init(PolygonBoxTransformParam<DeviceType>* param);
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef QUANT_OP
#pragma once #pragma once
#include "framework/operator.h" #include "framework/operator.h"
...@@ -30,3 +32,5 @@ class QuantizeKernel ...@@ -30,3 +32,5 @@ class QuantizeKernel
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -21,8 +21,6 @@ limitations under the License. */ ...@@ -21,8 +21,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using namespace framework;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class SumKernel class SumKernel
: public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> { : public framework::OpKernelBase<DeviceType, SumParam<DeviceType>> {
......
此差异已折叠。
此差异已折叠。
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef CONV_OP
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
// 3x3, stride-1 int8 convolution; writes the result into *output.
void conv3x3s1_int8(const framework::Tensor& input,
                    const framework::Tensor& weight, framework::Tensor* output);

// 3x3, stride-1 int8 convolution, 4-channel-blocked variant
// (processes 4 output channels at a time).
void conv3x3s1_int8_4c(const framework::Tensor& input,
                       const framework::Tensor& weight,
                       framework::Tensor* output);

// 5x5, stride-1 int8 convolution; writes the result into *output.
void conv5x5s1_int8(const framework::Tensor& input,
                    const framework::Tensor& weight, framework::Tensor* output);
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, ...@@ -3379,7 +3379,7 @@ void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
// 对 B 分块 // 对 B 分块
NC = L1 / (KC * sizeof(float)); NC = L1 / (KC * sizeof(float));
if (NC == 0) { if (NC == 0) {
NC == NR; NC = NR;
} else { } else {
int nblock_num = (n + NC - 1) / NC; int nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num; NC = (n + nblock_num - 1) / nblock_num;
...@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3662,7 +3662,7 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
b_ptr = b; b_ptr = b;
int kc1 = k / 8; int kc1 = k / 8;
int kc2 = k % 8; int kc2 = k % 8;
int step = 4 * ldc; int step = sizeof(float) * ldc;
asm volatile( asm volatile(
"pld [%[a_ptr]] \n\t" "pld [%[a_ptr]] \n\t"
"pld [%[a_ptr], #64] \n\t" "pld [%[a_ptr], #64] \n\t"
...@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) { ...@@ -3866,11 +3866,10 @@ void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
: :
: [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
[kc2] "r"(kc2), [step] "r"(step) [kc2] "r"(kc2), [step] "r"(step)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__ #endif // __aarch64__
#else
#endif // __ARM_NEON #endif // __ARM_NEON
} }
......
...@@ -22,9 +22,11 @@ limitations under the License. */ ...@@ -22,9 +22,11 @@ limitations under the License. */
#define C(i, j) C[(i)*ldc + (j)] #define C(i, j) C[(i)*ldc + (j)]
#if __aarch64__ #if __aarch64__
#define MR_INT8 4
#define MR 6 #define MR 6
#define NR 16 #define NR 16
#else #else
#define MR_INT8 4
#define MR 6 #define MR 6
#define NR 8 #define NR 8
#endif #endif
...@@ -96,6 +98,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -96,6 +98,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
float *c, float *C, int ldc, float *p, float *c, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
/* /*
// 向量矩阵乘法 (M = 1) // 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda, void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
...@@ -139,6 +142,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -139,6 +142,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_scale, float *new_bias); float *new_scale, float *new_bias);
void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
float *new_scale, float *new_bias, float *bias1); float *new_scale, float *new_bias, float *bias1);
/* /*
// 向量矩阵乘法结果回写 // 向量矩阵乘法结果回写
// C = A * B // C = A * B
...@@ -185,15 +189,67 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -185,15 +189,67 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
const float *B, int ldb, float *C, int ldc, float *p, const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1); std::string mode, float *bias, float *bias1);
// 8 bits function cluster begins
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// 8 bits int inner product
void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *a, const int8_t *b, int8_t beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int8_t *bias);
// 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
private: private:
int MC = 0; int MC = 0;
int KC = 0; int KC = 0;
int NC = 0; int NC = 0;
// 32位 float
float *packedA; float *packedA;
float *packedB; float *packedB;
float *packedC; float *packedC;
float *zero; float *zero;
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int8;
int8_t *zero_int8;
}; };
} // namespace math } // namespace math
......
此差异已折叠。
...@@ -28,15 +28,11 @@ namespace math { ...@@ -28,15 +28,11 @@ namespace math {
* [input_channels, filter_height, filter_width, output_height, * [input_channels, filter_height, filter_width, output_height,
* output_width] * output_width]
*/ */
template <class T> template <>
class Im2ColFunctor<ColFormat::kCFO, CPU, T> { void Im2ColFunctor<ColFormat::kCFO, CPU, float>::operator()(
public: const framework::Tensor &im, const std::vector<int> &dilation,
void operator()(const framework::Tensor &im, const std::vector<int> &dilation, const std::vector<int> &stride, const std::vector<int> &padding,
const std::vector<int> &stride, framework::Tensor *col) {
const std::vector<int> &padding, framework::Tensor *col) {
// PADDLE_ENFORCE(im.dims().size() == 3);
// PADDLE_ENFORCE(col->dims().size() == 5);
int im_channels = im.dims()[0]; int im_channels = im.dims()[0];
int im_height = im.dims()[1]; int im_height = im.dims()[1];
int im_width = im.dims()[2]; int im_width = im.dims()[2];
...@@ -45,30 +41,9 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -45,30 +41,9 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
int col_height = col->dims()[3]; int col_height = col->dims()[3];
int col_width = col->dims()[4]; int col_width = col->dims()[4];
// PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
// -
// ((dilation[0] * (filter_height - 1)
// + 1))) /
// stride[0] +
// 1,
// col_height,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
// PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
// -
// ((dilation[1] * (filter_width - 1)
// + 1))) /
// stride[1] +
// 1,
// col_width,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
const T *im_data = im.data<T>(); const float *im_data = im.data<float>();
T *col_data = col->data<T>(); float *col_data = col->data<float>();
#if __ARM_NEON #if __ARM_NEON
const int osize = col_height; const int osize = col_height;
const int isize = im_height; const int isize = im_height;
...@@ -249,8 +224,8 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -249,8 +224,8 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
col_data += 9 * oosize; col_data += 9 * oosize;
im_data += isize * isize; im_data += isize * isize;
} }
} else if (stride[0] == 2 && filter_height == 3 && pad1 && } else if (stride[0] == 2 && filter_height == 3 && pad1 && dilation[0] == 1 &&
dilation[0] == 1 && im_height > 2 && im_height == im_width) { im_height > 2 && im_height == im_width) {
for (int c = 0; c < im_channels; ++c) { for (int c = 0; c < im_channels; ++c) {
int oosize = osize * osize; int oosize = osize * osize;
int nk4 = osize / 4; int nk4 = osize / 4;
...@@ -396,15 +371,13 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -396,15 +371,13 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
for (int h = 0; h < col_height; ++h) { for (int h = 0; h < col_height; ++h) {
int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
for (int w = 0; w < col_width; ++w) { for (int w = 0; w < col_width; ++w) {
int im_col_idx = int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
w * stride[1] - padding[1] + w_offset * dilation[1];
int col_idx = (c * col_height + h) * col_width + w; int col_idx = (c * col_height + h) * col_width + w;
int im_idx = int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
(im_row_idx + c_im * im_height) * im_width + im_col_idx;
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width) im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0) ? static_cast<float>(0)
: im_data[im_idx]; : im_data[im_idx];
} }
} }
...@@ -424,14 +397,138 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> { ...@@ -424,14 +397,138 @@ class Im2ColFunctor<ColFormat::kCFO, CPU, T> {
col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
im_col_idx < 0 || im_col_idx >= im_width) im_col_idx < 0 || im_col_idx >= im_width)
? static_cast<T>(0) ? static_cast<float>(0)
: im_data[im_idx]; : im_data[im_idx];
} }
} }
} }
#endif #endif
}
void ExtractToImg(const int8_t *im_data, int8_t *col_data, const int im_height,
const int im_width, const int col_height, const int col_width,
const int padding_h, const int padding_w, const int stride_h,
const int stride_w, const int kh, const int kw) {
int h = padding_h - kh;
int w = padding_w - kw;
int col_start_height = h > 0 ? (h + stride_h - 1) / stride_h : 0;
int col_start_width = w > 0 ? (w + stride_w - 1) / stride_w : 0;
int start_height = kh + col_start_height * stride_h - padding_h;
int start_width = kw + col_start_width * stride_w - padding_w;
int end_height = (col_height - col_start_height) * stride_h + start_height;
end_height = end_height > im_height ? im_height : end_height;
int end_width = (col_width - col_start_width) * stride_w + start_width;
end_width = end_width > im_width ? im_width : end_width;
int extract = (end_width - start_width + stride_w - 1) / stride_w;
im_data += start_height * im_width + start_width;
col_data += col_start_height * col_width + col_start_width;
for (int i = start_height; i < end_height; i += stride_h) {
if (stride_w == 1) {
memcpy(col_data, im_data, extract * sizeof(int8_t));
} else if (stride_w == 2) {
int s = 0;
#if __ARM_NEON
for (; s < extract - 15; s += 16) {
int8x16x2_t img = vld2q_s8(im_data + s * 2);
vst1q_s8(col_data + s, img.val[0]);
} }
}; #endif
for (; s < extract; ++s) {
col_data[s] = im_data[s * 2];
}
} else if (stride_w == 3) {
int s = 0;
#if __ARM_NEON
for (; s < extract - 15; s += 16) {
int8x16x3_t img = vld3q_s8(im_data + s * 3);
vst1q_s8(col_data + s, img.val[0]);
}
#endif
for (; s < extract; ++s) {
col_data[s] = im_data[s * 3];
}
} else if (stride_w == 4) {
int s = 0;
#if __ARM_NEON
for (; s < extract - 15; s += 16) {
int8x16x4_t img = vld4q_s8(im_data + s * 4);
vst1q_s8(col_data + s, img.val[0]);
}
#endif
for (; s < extract; ++s) {
col_data[s] = im_data[s * 4];
}
} else {
PADDLE_MOBILE_THROW_EXCEPTION("stride_w must be one of 1, 2, 3 and 4.");
}
im_data += im_width * stride_h;
col_data += col_width;
}
}
/*
 * im = [input_channels, input_height, input_width]
 * col =
 * [input_channels, filter_height, filter_width, output_height,
 * output_width]
 *
 * int8 specialization of im2col (channels-first order). On ARM it
 * zero-fills the column buffer once and then copies only the in-bounds
 * elements via ExtractToImg; otherwise it falls back to the generic
 * per-element loop with explicit padding checks.
 */
template <>
void Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>::operator()(
    const framework::Tensor &im, const std::vector<int> &dilation,
    const std::vector<int> &stride, const std::vector<int> &padding,
    framework::Tensor *col) {
  int im_channels = im.dims()[0];
  int im_height = im.dims()[1];
  int im_width = im.dims()[2];
  int filter_height = col->dims()[1];
  int filter_width = col->dims()[2];
  int col_height = col->dims()[3];
  int col_width = col->dims()[4];

  // Total number of column planes: one per (channel, kh, kw) triple.
  int channels_col = im_channels * filter_height * filter_width;
  const int8_t *im_data = im.data<int8_t>();
  int8_t *col_data = col->data<int8_t>();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Fast path: no dilation and stride <= 4 (ExtractToImg's supported range).
  if (stride[0] <= 4 && dilation[0] == 1 && dilation[0] == dilation[1]) {
    // pad 0: pre-zero so ExtractToImg only needs to write valid elements.
    memset(col_data, 0, col->numel() * sizeof(int8_t));
    for (int ic = 0; ic < im_channels; ++ic) {
      for (int kh = 0; kh < filter_height; ++kh) {
        for (int kw = 0; kw < filter_width; ++kw) {
          ExtractToImg(im_data, col_data, im_height, im_width, col_height,
                       col_width, padding[0], padding[1], stride[0], stride[1],
                       kh, kw);
          // Advance to the next (channel, kh, kw) column plane.
          col_data += col_height * col_width;
        }
      }
      im_data += im_height * im_width;
    }
  } else {
#endif
    // Generic path: compute the source index per element, zero when the
    // sampled position falls inside the padding border.
    for (int c = 0; c < channels_col; ++c) {
      int w_offset = c % filter_width;
      int h_offset = (c / filter_width) % filter_height;
      int c_im = c / (filter_width * filter_height);
      for (int h = 0; h < col_height; ++h) {
        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
        for (int w = 0; w < col_width; ++w) {
          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
          int col_idx = (c * col_height + h) * col_width + w;
          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
                               im_col_idx < 0 || im_col_idx >= im_width)
                                  ? static_cast<int8_t>(0)
                                  : im_data[im_idx];
        }
      }
    }
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  }
#endif
}
/* /*
* im = [input_channels, input_height, input_width] * im = [input_channels, input_height, input_width]
...@@ -456,27 +553,6 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> { ...@@ -456,27 +553,6 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
int col_height = col.dims()[3]; int col_height = col.dims()[3];
int col_width = col.dims()[4]; int col_width = col.dims()[4];
// PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2]
// -
// ((dilation[0] * (filter_height - 1)
// + 1))) /
// stride[0] +
// 1,
// col_height,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
// PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3]
// -
// ((dilation[1] * (filter_width - 1)
// + 1))) /
// stride[1] +
// 1,
// col_width,
// "Output_height and
// padding(padding_up, padding_down)
// are " "inconsistent.");
int channels_col = im_channels * filter_height * filter_width; int channels_col = im_channels * filter_height * filter_width;
T *im_data = im->data<T>(); T *im_data = im->data<T>();
...@@ -503,9 +579,9 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> { ...@@ -503,9 +579,9 @@ class Col2ImFunctor<ColFormat::kCFO, CPU, T> {
}; };
template class Im2ColFunctor<ColFormat::kCFO, CPU, float>; template class Im2ColFunctor<ColFormat::kCFO, CPU, float>;
// template class Im2ColFunctor<ColFormat::kCFO, CPU, double>; template class Im2ColFunctor<ColFormat::kCFO, CPU, int8_t>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, float>; template class Col2ImFunctor<ColFormat::kCFO, CPU, float>;
template class Col2ImFunctor<ColFormat::kCFO, CPU, double>; template class Col2ImFunctor<ColFormat::kCFO, CPU, int8_t>;
/* /*
* im = [input_channels, input_height, input_width] * im = [input_channels, input_height, input_width]
...@@ -519,8 +595,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> { ...@@ -519,8 +595,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
void operator()(const framework::Tensor &im, const std::vector<int> &dilation, void operator()(const framework::Tensor &im, const std::vector<int> &dilation,
const std::vector<int> &stride, const std::vector<int> &stride,
const std::vector<int> &padding, framework::Tensor *col) { const std::vector<int> &padding, framework::Tensor *col) {
// PADDLE_ENFORCE(im.dims().size() == 3);
// PADDLE_ENFORCE(col->dims().size() == 5);
int im_channels = im.dims()[0]; int im_channels = im.dims()[0];
int im_height = im.dims()[1]; int im_height = im.dims()[1];
int im_width = im.dims()[2]; int im_width = im.dims()[2];
...@@ -528,18 +602,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> { ...@@ -528,18 +602,6 @@ class Im2ColFunctor<ColFormat::kOCF, CPU, T> {
int filter_width = col->dims()[4]; int filter_width = col->dims()[4];
int col_height = col->dims()[0]; int col_height = col->dims()[0];
int col_width = col->dims()[1]; int col_width = col->dims()[1];
// PADDLE_ENFORCE_EQ(
// (im_height + padding[0] + padding[2] -
// filter_height) / stride[0]
// + 1, col_height, "Output_height and
// padding(padding_up,
// padding_down) are " "inconsistent.");
// PADDLE_ENFORCE_EQ(
// (im_width + padding[1] + padding[3] -
// filter_width) / stride[1] +
// 1, col_width, "col_width and padding(padding_left,
// padding_right)
// are " "inconsistent.");
const T *im_data = im.data<T>(); const T *im_data = im.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
...@@ -589,8 +651,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> { ...@@ -589,8 +651,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
const std::vector<int> &dilation, const std::vector<int> &dilation,
const std::vector<int> &stride, const std::vector<int> &stride,
const std::vector<int> &padding, framework::Tensor *im) { const std::vector<int> &padding, framework::Tensor *im) {
// PADDLE_ENFORCE(im->dims().size() == 3);
// PADDLE_ENFORCE(col.dims().size() == 5);
int im_channels = im->dims()[0]; int im_channels = im->dims()[0];
int im_height = im->dims()[1]; int im_height = im->dims()[1];
int im_width = im->dims()[2]; int im_width = im->dims()[2];
...@@ -599,19 +659,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> { ...@@ -599,19 +659,6 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
int col_height = col.dims()[0]; int col_height = col.dims()[0];
int col_width = col.dims()[1]; int col_width = col.dims()[1];
// PADDLE_ENFORCE_EQ(
// (im_height + padding[0] + padding[2] -
// filter_height) / stride[0]
// + 1, col_height, "Output_height and
// padding(padding_up,
// padding_down) are " "inconsistent.");
// PADDLE_ENFORCE_EQ(
// (im_width + padding[1] + padding[3] -
// filter_width) / stride[1] +
// 1, col_width, "col_width and padding(padding_left,
// padding_right)
// are " "inconsistent.");
T *im_data = im->data<T>(); T *im_data = im->data<T>();
const T *col_data = col.data<T>(); const T *col_data = col.data<T>();
...@@ -651,9 +698,7 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> { ...@@ -651,9 +698,7 @@ class Col2ImFunctor<ColFormat::kOCF, CPU, T> {
}; };
template class Im2ColFunctor<ColFormat::kOCF, CPU, float>; template class Im2ColFunctor<ColFormat::kOCF, CPU, float>;
template class Im2ColFunctor<ColFormat::kOCF, CPU, double>;
template class Col2ImFunctor<ColFormat::kOCF, CPU, float>; template class Col2ImFunctor<ColFormat::kOCF, CPU, float>;
template class Col2ImFunctor<ColFormat::kOCF, CPU, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -15,12 +15,31 @@ limitations under the License. */ ...@@ -15,12 +15,31 @@ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include <cstring> #include <cstring>
#include <string> #include <string>
#include "framework/data_type.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
// Functor used with framework::VisitDataType: fills the captured tensor
// with `value`, cast to whatever element type T the visitor dispatches on.
struct TensorSetConstant {
  TensorSetConstant(framework::Tensor *tensor, float value)
      : tensor_(tensor), value_(value) {}
  // Invoked by VisitDataType with T = the tensor's runtime element type.
  template <typename T>
  void apply() const {
    auto *begin = tensor_->mutable_data<T>();
    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
  }
  framework::Tensor *tensor_;  // target tensor (not owned)
  float value_;                // fill value, converted to T in apply()
};
// Fills every element of `tensor` with `value`, dispatching on the
// tensor's runtime data type via framework::VisitDataType.
void set_constant(framework::Tensor *tensor, float value) {
  framework::VisitDataType(framework::ToDataType(tensor->type()),
                           TensorSetConstant(tensor, value));
}
template <> template <>
void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha, const framework::Tensor &matrix_b, bool trans_b, float alpha,
...@@ -135,7 +154,7 @@ template <typename T> ...@@ -135,7 +154,7 @@ template <typename T>
struct ClearTensor<CPU, T> { struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) { void operator()(framework::Tensor *tensor) {
auto size = tensor->numel(); auto size = tensor->numel();
auto *tensor_data = tensor->data<float>(); auto *tensor_data = tensor->data<T>();
memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
} }
}; };
...@@ -151,9 +170,9 @@ struct RowwiseAdd<CPU, T> { ...@@ -151,9 +170,9 @@ struct RowwiseAdd<CPU, T> {
PADDLE_MOBILE_ENFORCE((output->dims() == in_dims), PADDLE_MOBILE_ENFORCE((output->dims() == in_dims),
"output->dims() must be equal to in_dims."); "output->dims() must be equal to in_dims.");
auto *input_data = input.data<float>(); auto *input_data = input.data<T>();
auto *out_data = output->data<float>(); auto *out_data = output->data<T>();
auto *vec_data = vector.data<float>(); auto *vec_data = vector.data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) { for (int64_t j = 0; j < size; ++j) {
out_data[i * size + j] = input_data[i * size + j] + vec_data[j]; out_data[i * size + j] = input_data[i * size + j] + vec_data[j];
......
...@@ -15,17 +15,20 @@ limitations under the License. */ ...@@ -15,17 +15,20 @@ limitations under the License. */
#pragma once #pragma once
#include <cmath> #include <cmath>
#include <string>
#include "framework/tensor.h" #include "framework/tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
void set_constant(framework::Tensor *tensor, float value);
template <typename T> template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr); T *bias = nullptr);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstring>
#include <string>
#include "operators/math/gemm.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
                    const framework::Tensor &matrix_b, bool trans_b,
                    int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
                    bool relu, int8_t *bias) {
  // int8 GEMM: matrix_out(int32) = alpha * op(A)(int8) * B(int8) + beta * C,
  // with optional fused relu and bias (handled inside Gemm::Sgemm).
  auto dim_a = matrix_a.dims();
  auto dim_b = matrix_b.dims();
  auto dim_out = matrix_out->dims();
  PADDLE_MOBILE_ENFORCE(
      dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
      "The input and output of matmul must be matrix");
  int32_t M = dim_out[0];
  int32_t N = dim_out[1];
  int32_t K = (!trans_a) ? dim_a[1] : dim_a[0];
  Gemm gemm;
  if (trans_a) {
    // Gemm expects a row-major, non-transposed A; materialize A^T into a
    // scratch buffer before dispatching.
    int32_t numel = matrix_a.numel();
    int32_t m = matrix_a.dims()[0];
    int32_t n = matrix_a.dims()[1];
    const int8_t *tmp = matrix_a.data<int8_t>();
    int8_t *a = static_cast<int8_t *>(
        paddle_mobile::memory::Alloc(sizeof(int8_t) * numel));
    int32_t index = 0;
    for (int32_t j = 0; j < n; j++) {
      for (int32_t i = 0; i < m; i++) {
        a[index++] = tmp[i * n + j];
      }
    }
    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
               matrix_out->data<int32_t>(), N, relu, bias);
    // The transposed copy is owned locally; free it to avoid leaking one
    // buffer per transposed matmul call.
    paddle_mobile::memory::Free(a);
  } else {
    gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
               relu, bias);
  }
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/math/pad.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <typename T>
class PadFunctor<CPU, T> {
 public:
  // Copies `input` (layout [N, C, H, W]) into the center of `output`,
  // leaving a zero border of pad_h rows (top/bottom) and pad_w columns
  // (left/right) around every channel plane. `output` must already be
  // sized to [N, C, H + 2*pad_h, W + 2*pad_w].
  void operator()(const framework::Tensor &input, const int pad_h,
                  const int pad_w, framework::Tensor *output) {
    const T *in_data = input.data<T>();
    T *out_data = output->mutable_data<T>();
    const framework::DDim &in_shape = input.dims();
    const framework::DDim &out_shape = output->dims();
    // Zero everything up front so the padded border needs no extra writes.
    memset(out_data, 0, sizeof(T) * output->numel());
    const int batch = in_shape[0];
    const int channels = in_shape[1];
    const int in_h = in_shape[2];
    const int in_w = in_shape[3];
    const int out_w = out_shape[3];
    // Copy one interior row at a time, addressing both tensors by index
    // rather than bumping pointers.
    int plane = 0;
    for (int n = 0; n < batch; ++n) {
      for (int c = 0; c < channels; ++c, ++plane) {
        const T *src = in_data + static_cast<int64_t>(plane) * in_h * in_w;
        T *dst = out_data +
                 (static_cast<int64_t>(plane) * (in_h + 2 * pad_h) + pad_h) *
                     out_w;
        for (int h = 0; h < in_h; ++h) {
          memcpy(dst + pad_w, src, sizeof(T) * in_w);
          dst += out_w;
          src += in_w;
        }
      }
    }
  }
};
template class PadFunctor<CPU, float>;
template class PadFunctor<CPU, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/tensor.h"
namespace paddle_mobile {
namespace operators {
namespace math {
// Zero-pads a 4-D tensor along its last two dimensions: pad_h rows are added
// on each of top/bottom and pad_w columns on each of left/right of every
// plane (see the per-device specializations for the implementation).
template <typename DeviceType, typename T>
class PadFunctor {
 public:
  // input:  source tensor, indexed as [dim0, dim1, dim2, dim3].
  // pad_h:  zero rows added on each side along dim2.
  // pad_w:  zero columns added on each side along dim3.
  // output: pre-sized destination tensor; border elements are set to zero.
  void operator()(const framework::Tensor &input, const int pad_h,
                  const int pad_w, framework::Tensor *output);
};
} // namespace math
} // namespace operators
} // namespace paddle_mobile
...@@ -32,9 +32,6 @@ class Vol2ColFunctor<CPU, T> { ...@@ -32,9 +32,6 @@ class Vol2ColFunctor<CPU, T> {
void operator()(const Tensor &vol, const std::vector<int> &dilations, void operator()(const Tensor &vol, const std::vector<int> &dilations,
const std::vector<int> &strides, const std::vector<int> &strides,
const std::vector<int> &paddings, Tensor *col) const { const std::vector<int> &paddings, Tensor *col) const {
// PADDLE_ENFORCE(vol.dims().size() == 4);
// PADDLE_ENFORCE(col->dims().size() == 7);
int input_channels = vol.dims()[0]; int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1]; int input_depth = vol.dims()[1];
int input_height = vol.dims()[2]; int input_height = vol.dims()[2];
...@@ -48,32 +45,6 @@ class Vol2ColFunctor<CPU, T> { ...@@ -48,32 +45,6 @@ class Vol2ColFunctor<CPU, T> {
int channels_col = int channels_col =
input_channels * filter_depth * filter_height * filter_width; input_channels * filter_depth * filter_height * filter_width;
// PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
// ((dilations[0] * (filter_depth - 1)
// + 1))) /
// strides[0] +
// 1,
// output_depth,
// "input_depth and output_depth are "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
// ((dilations[1] * (filter_height -
// 1) + 1))) /
// strides[1] +
// 1,
// output_height,
// "input_height and output_height are
// "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
// ((dilations[2] * (filter_width - 1)
// + 1))) /
// strides[2] +
// 1,
// output_width,
// "input_width and output_width are "
// "mismatching.");
const T *vol_data = vol.data<T>(); const T *vol_data = vol.data<T>();
T *col_data = col->data<T>(); T *col_data = col->data<T>();
...@@ -119,9 +90,6 @@ class Col2VolFunctor<CPU, T> { ...@@ -119,9 +90,6 @@ class Col2VolFunctor<CPU, T> {
void operator()(const Tensor &col, const std::vector<int> &dilations, void operator()(const Tensor &col, const std::vector<int> &dilations,
const std::vector<int> &strides, const std::vector<int> &strides,
const std::vector<int> &paddings, Tensor *vol) const { const std::vector<int> &paddings, Tensor *vol) const {
// PADDLE_ENFORCE(vol->dims().size() == 4);
// PADDLE_ENFORCE(col.dims().size() == 7);
int input_channels = vol->dims()[0]; int input_channels = vol->dims()[0];
int input_depth = vol->dims()[1]; int input_depth = vol->dims()[1];
int input_height = vol->dims()[2]; int input_height = vol->dims()[2];
...@@ -135,31 +103,6 @@ class Col2VolFunctor<CPU, T> { ...@@ -135,31 +103,6 @@ class Col2VolFunctor<CPU, T> {
int channels_col = int channels_col =
input_channels * filter_depth * filter_height * filter_width; input_channels * filter_depth * filter_height * filter_width;
// PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
// ((dilations[0] * (filter_depth - 1)
// + 1))) /
// strides[0] +
// 1,
// output_depth,
// "input_depth and output_depth are "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
// ((dilations[1] * (filter_height -
// 1) + 1))) /
// strides[1] +
// 1,
// output_height,
// "input_height and output_height are
// "
// "mismatching.");
// PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
// ((dilations[2] * (filter_width - 1)
// + 1))) /
// strides[2] +
// 1,
// output_width,
// "input_width and output_width are "
// "mismatching.");
T *vol_data = vol->data<T>(); T *vol_data = vol->data<T>();
const T *col_data = col.data<T>(); const T *col_data = col.data<T>();
...@@ -195,9 +138,9 @@ class Col2VolFunctor<CPU, T> { ...@@ -195,9 +138,9 @@ class Col2VolFunctor<CPU, T> {
}; };
template class Vol2ColFunctor<CPU, float>; template class Vol2ColFunctor<CPU, float>;
template class Vol2ColFunctor<CPU, double>; template class Vol2ColFunctor<CPU, int8_t>;
template class Col2VolFunctor<CPU, float>; template class Col2VolFunctor<CPU, float>;
template class Col2VolFunctor<CPU, double>; template class Col2VolFunctor<CPU, int8_t>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -546,11 +546,11 @@ class MulParam : OpParam { ...@@ -546,11 +546,11 @@ class MulParam : OpParam {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam { ...@@ -999,6 +999,28 @@ class MultiClassNMSParam : public OpParam {
}; };
#endif #endif
#ifdef POLYGONBOXTRANSFORM_OP
// Parameter pack for the polygon_box_transform operator: resolves the
// operator's single input and single output tensors from the scope at
// construction time and exposes them through accessors.
template <typename Dtype>
class PolygonBoxTransformParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;

 public:
  // Looks up the "Input" and "Output" variables in `scope`; `attrs` is
  // unused because this op has no attributes.
  PolygonBoxTransformParam(const VariableNameMap &inputs,
                           const VariableNameMap &outputs,
                           const AttributeMap &attrs, const Scope &scope) {
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutputFrom<GType>(outputs, scope);
  }
  const RType *Input() const { return input_; }
  RType *Output() const { return output_; }

 private:
  RType *input_;   // non-owning; tensor lives in the scope
  RType *output_;  // non-owning; tensor lives in the scope
};
#endif
template <typename Dtype> template <typename Dtype>
class FeedParam : public OpParam { class FeedParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -1041,6 +1063,42 @@ class FetchParam : public OpParam { ...@@ -1041,6 +1063,42 @@ class FetchParam : public OpParam {
RType *out_; RType *out_;
}; };
#ifdef FILL_CONSTANT_OP
// Parameter pack for the fill_constant operator: captures the output
// variable plus the "dtype", "shape" and "value" attributes that describe
// the constant tensor to create.
template <typename Dtype>
class FillConstantParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;

 public:
  FillConstantParam(const VariableNameMap &inputs,
                    const VariableNameMap &outputs, const AttributeMap &attrs,
                    const Scope &scope) {
    // Both the raw Variable and its typed tensor view are kept: the kernel
    // needs the Variable to decide how to materialize the output.
    out_var_ = OutVarFrom(outputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
    dtype_ = GetAttr<int>("dtype", attrs);
    shape_ = GetAttr<vector<int>>("shape", attrs);
    value_ = GetAttr<float>("value", attrs);
  }
  Variable *OutVar() const { return out_var_; }
  RType *Out() const { return out_; }
  // Framework data-type enum value for the output tensor.
  const int &DataDtype() const { return dtype_; }
  const vector<int> &Shape() const { return shape_; }
  // Scalar the output tensor is filled with.
  const float &Value() const { return value_; }

 private:
  Variable *out_var_;  // non-owning; variable lives in the scope
  RType *out_;         // non-owning; tensor lives in the scope
  int dtype_;
  vector<int> shape_;
  float value_;
};
#endif
#ifdef TRANSPOSE_OP #ifdef TRANSPOSE_OP
template <typename Dtype> template <typename Dtype>
class TransposeParam : public OpParam { class TransposeParam : public OpParam {
...@@ -1401,11 +1459,11 @@ class FusionFcParam : public OpParam { ...@@ -1401,11 +1459,11 @@ class FusionFcParam : public OpParam {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
...@@ -1441,11 +1499,11 @@ class FusionConvAddParam : public ConvParam<Dtype> { ...@@ -1441,11 +1499,11 @@ class FusionConvAddParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
...@@ -1496,11 +1554,11 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> { ...@@ -1496,11 +1554,11 @@ class FusionConvAddPReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1554,11 +1612,11 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> { ...@@ -1554,11 +1612,11 @@ class FusionConvAddAddPReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1629,11 +1687,11 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> { ...@@ -1629,11 +1687,11 @@ class FusionConvAddBNReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1715,11 +1773,11 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> { ...@@ -1715,11 +1773,11 @@ class FusionConvBNAddReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1782,11 +1840,11 @@ class FusionConvBNParam : public ConvParam<Dtype> { ...@@ -1782,11 +1840,11 @@ class FusionConvBNParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1857,11 +1915,11 @@ class FusionConvAddBNParam : public ConvParam<Dtype> { ...@@ -1857,11 +1915,11 @@ class FusionConvAddBNParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -1983,11 +2041,11 @@ class FusionConvBNReluParam : public ConvParam<Dtype> { ...@@ -1983,11 +2041,11 @@ class FusionConvBNReluParam : public ConvParam<Dtype> {
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
fpga::WrapperConvArgs fpga_conv_args; fpga::SplitConvArgs fpga_conv_args;
public: public:
const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; }
void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; }
#endif #endif
}; };
#endif #endif
...@@ -2272,6 +2330,7 @@ class ShapeParam : public OpParam { ...@@ -2272,6 +2330,7 @@ class ShapeParam : public OpParam {
}; };
#endif #endif
#ifdef QUANT_OP
template <typename Dtype> template <typename Dtype>
class QuantizeParam : public OpParam { class QuantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -2282,14 +2341,12 @@ class QuantizeParam : public OpParam { ...@@ -2282,14 +2341,12 @@ class QuantizeParam : public OpParam {
const AttributeMap &attrs, const Scope &scope) { const AttributeMap &attrs, const Scope &scope) {
input_ = InputXFrom<GType>(inputs, scope); input_ = InputXFrom<GType>(inputs, scope);
out_ = OutFrom<GType>(outputs, scope); out_ = OutFrom<GType>(outputs, scope);
if (HasAttr("is_static", attrs)) {
is_static_ = GetAttr<bool>("is_static", attrs);
}
// online // online
// scale = max(abs(x)) // scale = max(abs(x))
online_scale_ = GetVarValue<GType>("OutScale", outputs, scope); online_scale_ = GetVarValue<GType>("OutScale", outputs, scope);
// offline // offline
if (HasAttr("static_scale", attrs)) { if (HasAttr("static_scale", attrs)) {
is_static_ = true;
static_scale_ = GetAttr<float>("static_scale", attrs); static_scale_ = GetAttr<float>("static_scale", attrs);
} }
// x = round(scale * x) // x = round(scale * x)
...@@ -2311,9 +2368,11 @@ class QuantizeParam : public OpParam { ...@@ -2311,9 +2368,11 @@ class QuantizeParam : public OpParam {
float static_scale_ = 1.0f; float static_scale_ = 1.0f;
// round method type // round method type
// nearest_zero and nearest_even is valid currently // nearest_zero and nearest_even is valid currently
RoundType round_type_ = ROUND_NEAREST_TO_EVEN; RoundType round_type_ = ROUND_NEAREST_AWAY_ZERO;
}; };
#endif
#ifdef DEQUANT_OP
template <typename Dtype> template <typename Dtype>
class DequantizeParam : public OpParam { class DequantizeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -2341,6 +2400,7 @@ class DequantizeParam : public OpParam { ...@@ -2341,6 +2400,7 @@ class DequantizeParam : public OpParam {
RType *activation_scale_; RType *activation_scale_;
float weight_scale_; float weight_scale_;
}; };
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#include "operators/polygon_box_transform_op.h"
namespace paddle_mobile {
namespace operators {
// Validates the operator's inputs and sets the output shape. The output of
// polygon_box_transform has exactly the same dims as the input; only the
// first two input dims are constrained (rank 4, even channel count).
template <typename Dtype, typename T>
void PolygonBoxTransformOp<Dtype, T>::InferShape() const {
  // Fixed copy-pasted error text: these messages previously referred to the
  // unrelated "get_shape" op.
  PADDLE_MOBILE_ENFORCE(
      this->param_.Input() != nullptr,
      "Input (Input) of polygon_box_transform op should not be null.");
  PADDLE_MOBILE_ENFORCE(
      this->param_.Output() != nullptr,
      "Output (Output) of polygon_box_transform op should not be null.");
  auto input_dims = this->param_.Input()->dims();
  PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "input's rank must be 4.");
  PADDLE_MOBILE_ENFORCE(input_dims[1] % 2 == 0,
                        "input's second dimension must be even.");
  this->param_.Output()->Resize(input_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(polygon_box_transform, ops::PolygonBoxTransformOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POLYGONBOXTRANSFORM_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/polygon_box_transform_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
using paddle_mobile::framework::Tensor;
// Operator wrapper for polygon_box_transform, wiring the param class and the
// device kernel into the OperatorWithKernel framework. Shape inference is
// implemented out of line (see the matching .cpp).
template <typename DeviceType, typename T>
class PolygonBoxTransformOp
    : public framework::OperatorWithKernel<
          DeviceType, PolygonBoxTransformParam<DeviceType>,
          operators::PolygonBoxTransformKernel<DeviceType, T>> {
 public:
  // Forwards everything to the base operator; no extra state is held here.
  PolygonBoxTransformOp(const std::string &type, const VariableNameMap &inputs,
                        const VariableNameMap &outputs,
                        const framework::AttributeMap &attrs,
                        std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, PolygonBoxTransformParam<DeviceType>,
            operators::PolygonBoxTransformKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // NOTE(review): the inherited-constructor declaration below is redundant
  // with the explicit constructor above — confirm before removing either.
  using framework::OperatorWithKernel<
      DeviceType, PolygonBoxTransformParam<DeviceType>,
      operators::PolygonBoxTransformKernel<DeviceType, T>>::OperatorWithKernel;
  void InferShape() const override;

 protected:
};
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef QUANT_OP
#include "operators/quantize_op.h" #include "operators/quantize_op.h"
#include <vector> #include <vector>
...@@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators; ...@@ -33,3 +35,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp); REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif #endif
#endif
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef QUANT_OP
#pragma once #pragma once
#include <string> #include <string>
...@@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel< ...@@ -40,3 +42,5 @@ class QuantizeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -26,7 +26,7 @@ void SumOp<Dtype, T>::InferShape() const { ...@@ -26,7 +26,7 @@ void SumOp<Dtype, T>::InferShape() const {
auto inputs = this->param_.Inputs(); auto inputs = this->param_.Inputs();
const size_t n = inputs.size(); const size_t n = inputs.size();
std::vector<DDim> inputs_dims; std::vector<framework::DDim> inputs_dims;
inputs_dims.reserve(n); inputs_dims.reserve(n);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
inputs_dims.push_back(inputs[i]->dims()); inputs_dims.push_back(inputs[i]->dims());
...@@ -65,7 +65,6 @@ REGISTER_OPERATOR_CPU(sum, ops::SumOp); ...@@ -65,7 +65,6 @@ REGISTER_OPERATOR_CPU(sum, ops::SumOp);
REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp); REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(sum, ops::ConcatOp);
#endif #endif
#endif #endif
此差异已折叠。
...@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) { ...@@ -84,7 +84,7 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
} }
paddle_mobile::operators::math::Gemm gemm; paddle_mobile::operators::math::Gemm gemm;
gemm.SgemmWithBn(m, n, k, 0.9, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias, gemm.SgemmWithBn(m, n, k, 1, a, lda, b, ldb, 0.3, c, ldc, relu, scale, bias,
nullptr); nullptr);
int eq = 0; int eq = 0;
int neq = 0; int neq = 0;
......
此差异已折叠。
...@@ -28,13 +28,11 @@ limitations under the License. */ ...@@ -28,13 +28,11 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4); paddle_mobile.SetThreadNum(1);
Tensor aa, bb, cc, scale, bias; Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k}); auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n}); auto bbptr = bb.mutable_data<float>({k, n});
auto ccptr = cc.mutable_data<float>({m, n}); auto ccptr = cc.mutable_data<float>({m, n});
auto scaleptr = scale.mutable_data<float>({m});
auto biasptr = bias.mutable_data<float>({m});
for (int i = 0; i < m * k; ++i) { for (int i = 0; i < m * k; ++i) {
aaptr[i] = 2; aaptr[i] = 2;
...@@ -45,23 +43,55 @@ int main() { ...@@ -45,23 +43,55 @@ int main() {
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
ccptr[i] = 2; ccptr[i] = 2;
} }
for (int i = 0; i < m; ++i) {
scaleptr[i] = 1; Tensor aa_int8, bb_int8, cc_int8;
biasptr[i] = 0; auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < k * n; ++i) {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int8[i] = static_cast<int32_t>(2);
} }
auto time1 = time(); // float
// warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>( paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0), aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, biasptr); false, nullptr);
}
// paddle_mobile::operators::math::matmulWithBn<float>( auto time1 = time();
// aa, false, bb, false, static_cast<float>(1), &cc, for (int j = 0; j < 10; ++j) {
// static_cast<float>(0), true, &scale, &bias, 0); paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
} }
auto time2 = time(); auto time2 = time();
std::cout << "gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
// int8_t
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
return 0; return 0;
} }
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
...@@ -59,7 +59,7 @@ int TestDequqntizeOp() { ...@@ -59,7 +59,7 @@ int TestDequqntizeOp() {
framework::Tensor output_cmp; framework::Tensor output_cmp;
output_cmp.Resize(dim); output_cmp.Resize(dim);
float dequant_scale = 1.f / (1.27 * 1.74); float dequant_scale = 1.27 / 1.74;
dequantize(input, dequant_scale, &output_cmp); dequantize(input, dequant_scale, &output_cmp);
const float* output_cmp_data = output_cmp.data<float>(); const float* output_cmp_data = output_cmp.data<float>();
for (int i = 0; i < output->numel(); ++i) { for (int i = 0; i < output->numel(); ++i) {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册