diff --git a/.gitignore b/.gitignore index 8f92118b08bb30531869c28d32d335cc47116350..8c4450181d82116620d880c93789dee9dcda9d73 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,4 @@ metal/images/ metal/paddle-mobile/paddle-mobile/CPU/libpaddle-mobile.a *.xcuserdatad/ */xcuserdata/ +/venv/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f9fbcbc18d0bfe1d634dd6815b16a5f1862e846..a98d815943cf4d4bb3d632ccfcb83fc7818e047d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 3.0) project(paddle-mobile) # select the platform to build -option(CPU "armv7 with neon support" ON) +option(CPU "armv7 with neon support" OFF) option(MALI_GPU "mali gpu support" OFF) -option(FPGA "fpga support" OFF) +option(FPGA "fpga support" ON) option(USE_OPENMP "openmp support" OFF) option(DEBUGING "enable debug mode" ON) @@ -20,6 +20,7 @@ set(CMAKE_CXX_FLAGS "-O3 -s -DNDEBUG ${CMAKE_CXX_FLAGS}") if(IS_IOS) set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \ -std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}") + add_compile_options(-fembed-bitcode) else() set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") endif() @@ -28,7 +29,10 @@ if(DEBUGING) message(STATUS "debugging mode") add_definitions(-DPADDLE_MOBILE_DEBUG) else() - add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) + if(FPGA) + else() + add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) + endif() endif() if(USE_EXCEPTION) @@ -92,8 +96,7 @@ else() endif() if(FPGA) - set(DEBUGING ON) - add_definitions(-DPADDLE_MOBILE_DEBUG) + message("FPGA mode enabled") add_definitions(-DPADDLE_MOBILE_FPGA) else() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) @@ -176,6 +179,10 @@ if(DEBUGING) else() add_subdirectory(test) endif() +elseif(FPGA) + add_subdirectory(test) endif() + + diff --git a/README.md b/README.md index 
de7dd530c94b4a3055cbf07a4a19a55c21457ed0..b86860830066cf1b622ff3b449803b0446794b74 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,8 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 ### 开发文档 开发文档主要是关于编译、运行等问题。做为开发者,它可以和贡献文档共同结合使用。 -[开发文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_doc.md) +* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md) +* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md) ### 贡献文档 - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) diff --git a/benchmark/arm_benchmark.md b/benchmark/arm_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..280bec16e4baf035eb30138d49b2d31d038aa4c7 --- /dev/null +++ b/benchmark/arm_benchmark.md @@ -0,0 +1,36 @@ +|mobilenet arm v7|1线程|2线程|4线程| +|------------|----|-----|-----| +|麒麟970(ms)|108.180|63.935|37.545| +|麒麟960(ms)|108.588|63.073|36.822| +|高通845(ms)|85.952|48.890|28.641| +|高通835(ms)|105.434|62.752|37.131| +||||| +|mobilenetssd arm v7|1线程|2线程|4线程| +|麒麟970(ms)|212.686|127.205|77.485| +|麒麟960(ms)|212.641|125.338|75.250| +|高通845(ms)|182.863|95.671|56.857| +|高通835(ms)|213.849|127.717|77.006| +||||| +|googlenet(v1) arm v7|1线程|2线程|4线程| +|麒麟970(ms)|335.288|234.559|161.295| +|麒麟960(ms)|354.443|232.642|157.815| +|高通845(ms)|282.007|173.146|122.148| +|高通835(ms)|341.250|233.354|158.554| +||||| +|squeezenet arm v7|1线程|2线程|4线程| +|麒麟970(ms)|83.726|57.944|36.923| +|麒麟960(ms)|85.835|55.762|36.496| +|高通845(ms)|71.301|41.618|28.785| +|高通835(ms)|82.407|56.176|36.455| +||||| +|yolo arm v7|1线程|2线程|4线程| +|麒麟970(ms)|129.658|79.993|49.969| +|麒麟960(ms)|130.208|78.791|48.390| +|高通845(ms)|109.244|61.736|40.600| +|高通835(ms)|130.402|80.863|50.359| + + 测试机型信息: + 麒麟970:荣耀v10 (2.36GHz * 4 + 1.8GHz * 4) + 麒麟960:华为mate9 (2.36GHz * 4 + 1.8GHz * 4) + 骁龙835:小米6 (2.45GHz * 4 + 1.9GHz * 4) + 骁龙845:OPPO FindX (2.80GHz * 4 + 1.8GHz * 4) \ No newline at end of file 
diff --git a/benchmark/metal_benchmark.md b/benchmark/metal_benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..e3e5d0750f72fc395c402d516aa9fee02a0fcd7f --- /dev/null +++ b/benchmark/metal_benchmark.md @@ -0,0 +1,10 @@ +|mobilenetfssd|速度| +|------------|-----| +|A9(ms)|33.78| +|A10(ms)|24.05| +|A11(ms)|17.15| +||| +|genet|速度| +|A9(ms) |3.49| +|A10(ms)|2.54| +|A11(ms)|1.43| \ No newline at end of file diff --git a/doc/design_doc.md b/doc/design_doc.md index bf5f78e8d805465418cad8989945f2afa7ab5587..70292c6b0bd617930a9c9458b87cef34dee3347e 100644 --- a/doc/design_doc.md +++ b/doc/design_doc.md @@ -3,7 +3,7 @@ #### 以下是 paddle-mobile 代码的执行流程图: -![执行流程图](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305189473720.png) +![执行流程图](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/flow_chart.png) #### 主要分为: Loader 模块、 Program 模块、 Executor 模块、 op 模块、 kernel 模块、scope variable Tensor 模块 @@ -14,12 +14,12 @@ 先来看一下模型, 模型分为两种结构: 一种为参数文件是散开的, 如下图, 红框为模型结构的 protobuf 文件, 其余为参数文件 -![模型描述](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305190629577.png) +![模型描述](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc.png) 另一种为参数文件结合在一起的, 如下图, 红框内为模型结构描述的 protobuf 文件, 另一个文件为结合在一起的参数文件 -![模型描述combined](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191057130.png) +![模型描述combined](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/model_desc_combined.png) loader 模块的作用是将模型结构信息 load 进内存, 将红框内的 protobuf 文件 load 进内存, 并对模型结构进行优化(如将几个细粒度的 op 融合成 粗粒度的 op, 如将 conv、 add、 batchnorm、 relu 融合为 conv\_add\_batchnorm\_relu). @@ -161,7 +161,7 @@ sh build.sh android yolo ### 五. 
kernel kernel 为 op 的底层运算实现, 主要有两个函数, Init 和 Compute, 分别用来初始化、预处理 和 运算操作, 值得提出的是, kernel 会根据泛型特化到不同的平台, 如图所示: -![设备特化]![](http://otkwwi4x8.bkt.clouddn.com/2018-07-02-15305191401976.png) +![设备特化](http://mms-graph.bj.bcebos.com/paddle-mobile/git_images/devices.png) 不同平台的 kernel 实现, 为同一个 kernel 类不同泛型的特化实现, 目前有三个平台, arm、mali、fpga, 图中的 central-arm-func\ 目录为 op kernel 的 arm 实现, 它承担了 arm\ 目录下 kernel 的底层实现, 同时 arm 处理器作为中央处理器, central-arm-func\ 也可以作为其他协处理器的底层实现, 如: fpga 的某一个 op kernel 还没有 fpga 协处理器的实现, 就可以直接调用使用这里的 arm 实现. diff --git a/doc/development_doc.md b/doc/development_android.md similarity index 79% rename from doc/development_doc.md rename to doc/development_android.md index 3f45f956f00e78c23b60b4c108b8c90cf4065e04..528d7aa2def78103b8dbdcf0329279f029c85cac 100644 --- a/doc/development_doc.md +++ b/doc/development_android.md @@ -1,74 +1,3 @@ -### iOS&Android开发文档 - -# iOS开发文档 - -## 编译 - -```sh - -# 在 paddle-mobile 目录下: -cd tools - -sh build.sh ios - -# 如果只想编译某个特定模型的 op, 则需执行以下命令 -sh build.sh ios googlenet - -# 在这个文件夹下, 你可以拿到生成的 .a 库 -cd ../build/release/ios/build - -``` -#### 常见问题: - -1. No iOS SDK's found in default search path ... 
- - 这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定, - 以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk") - -## 集成 - -``` -将上一步生成的: -libpaddle-mobile.a - -/src/ios_io/ 下的 -PaddleMobile.h -``` -拖入工程 - -#### oc 接口 - -接口如下: - -``` -/* - 创建对象 -*/ -- (instancetype)init; - -/* - load 模型, 开辟内存 -*/ -- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; - -/* - 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict -*/ -- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; - -/* - 进行预测 -*/ -- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; - -/* - 清理内存 -*/ -- (void)clear; - -``` - - # Android开发文档 用户可通过如下两种方式,交叉编译Android平台上适用的paddle-mobile库: diff --git a/doc/development_arm_linux.md b/doc/development_arm_linux.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/doc/development_ios.md b/doc/development_ios.md new file mode 100644 index 0000000000000000000000000000000000000000..1d4f28bd5bcde1c3068ddeae87627ae6686d886a --- /dev/null +++ b/doc/development_ios.md @@ -0,0 +1,85 @@ +# iOS开发文档 + +## CPU + +需要: xcode + +### 编译 + +```sh + +# 在 paddle-mobile 目录下: +cd tools + +sh build.sh ios + +# 如果只想编译某个特定模型的 op, 则需执行以下命令 +sh build.sh ios googlenet + +# 在这个文件夹下, 你可以拿到生成的 .a 库 +cd ../build/release/ios/build + +``` +#### 常见问题: + +1. No iOS SDK's found in default search path ... 
+ + 这个问题是因为 tools/ios-cmake/ios.toolchain.cmake 找不到你最近使用的 iOS SDK 路径, 所以需要自己进行指定, + 以我当前的环境为例: 在 tools/ios-cmake/ios.toolchain.cmake 143行前添加我本地的 iOS SDK 路径: set(CMAKE_IOS_SDK_ROOT "/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk") + +### 集成 + +``` +将上一步生成的: +libpaddle-mobile.a + +/src/ios_io/ 下的 +PaddleMobile.h +``` +拖入工程 + +#### oc 接口 + +接口如下: + +``` +/* + 创建对象 +*/ +- (instancetype)init; + +/* + load 模型, 开辟内存 +*/ +- (BOOL)load:(NSString *)modelPath andWeightsPath:(NSString *)weighsPath; + +/* + 进行预测, means 和 scale 为训练模型时的预处理参数, 如训练时没有做这些预处理则直接使用 predict +*/ +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim means:(NSArray *)means scale:(float)scale; + +/* + 进行预测 +*/ +- (NSArray *)predict:(CGImageRef)image dim:(NSArray *)dim; + +/* + 清理内存 +*/ +- (void)clear; + +``` + +## GPU + +需要: xcode、cocoapods + +``` +# 在 paddle-mobile 目录下: +cd metal + +pod install + +open paddle-mobile.xcworkspace + +``` diff --git a/doc/images/devices.png b/doc/images/devices.png deleted file mode 100644 index 413d32c249972ee96f678d50a5cd0b36a2a03e29..0000000000000000000000000000000000000000 Binary files a/doc/images/devices.png and /dev/null differ diff --git a/doc/images/flow_chart.png b/doc/images/flow_chart.png deleted file mode 100644 index c747230da43e2e688d7460704268631758d34596..0000000000000000000000000000000000000000 Binary files a/doc/images/flow_chart.png and /dev/null differ diff --git a/doc/images/model_desc.png b/doc/images/model_desc.png deleted file mode 100644 index 3c026b6192c8e1d84b3a82c3db91e022f35358c2..0000000000000000000000000000000000000000 Binary files a/doc/images/model_desc.png and /dev/null differ diff --git a/doc/images/model_desc_combined.png b/doc/images/model_desc_combined.png deleted file mode 100644 index 38e7388efcfdcad53f4e80ce0ac5d3b993eb986c..0000000000000000000000000000000000000000 Binary files a/doc/images/model_desc_combined.png and /dev/null differ diff --git a/metal/README.md 
b/metal/README.md index 90c517a2c10c28a9fcf26357e65ce2178a2fd8ac..2da6558b05b051b8b476f259d49fa3845e397b29 100644 --- a/metal/README.md +++ b/metal/README.md @@ -1,3 +1,12 @@ ## Paddle-Mobile -This folder is used to develop metal version for ios gpu +需要: xcode、 cocoapods + +``` +pod install + +open paddle-mobile.xcworkspace + +``` + +Demo 所需依赖的模型可在[这里](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)下载 diff --git a/src/common/types.cpp b/src/common/types.cpp index 18b143a974d7bee7a79b9b14233b30a497882b94..46e5bfab3711ac81f5438cb21105843f52183e15 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -62,6 +62,8 @@ const char *G_OP_TYPE_CRF = "crf_decoding"; const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp"; const char *G_OP_TYPE_FLATTEN = "flatten"; const char *G_OP_TYPE_SHAPE = "shape"; +const char *G_OP_TYPE_ELEMENTWISE_MUL = "elementwise_mul"; +const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; @@ -115,7 +117,8 @@ std::unordered_map< {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}}, {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}}, {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}}, + {G_OP_TYPE_SUM, {{"X"}, {"Out"}}}, + {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}}; - } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h index ec2e3ea2f2c818ca6ea7634ac1c564bbca492a34..0855bd053f0dc804b6f3289796f3818657675864 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -126,6 +126,8 @@ extern const char *G_OP_TYPE_REGION; extern const char *G_OP_TYPE_FUSION_CONV_BN; extern const char *G_OP_TYPE_CONV_TRANSPOSE; extern const char *G_OP_TYPE_PRELU; +extern const char *G_OP_TYPE_SUM; +extern const char *G_OP_TYPE_ELEMENTWISE_MUL; extern const char *G_OP_TYPE_QUANTIZE; extern const char *G_OP_TYPE_DEQUANTIZE; diff --git 
a/src/common/variant.h b/src/common/variant.h index 8ec9ccb7a92acb06417a74d9ebe95189ac9e547f..4aa4f47c628caec438ecd00522d90ebf299da6a0 100644 --- a/src/common/variant.h +++ b/src/common/variant.h @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#pragma once +#include +#include +#include #include "common/enforce.h" #include "common/log.h" -#pragma once - namespace paddle_mobile { + template struct IDToType { typedef Type type_t; @@ -79,13 +81,13 @@ struct Variant { template void Set(Args &&... args) { - helper::Destroy(type_id, &data.data); - new (&data.data) T(std::forward(args)...); + helper::Destroy(type_id, data.data); + new (data.data) T(std::forward(args)...); type_id = typeid(T).hash_code(); } void SetString(std::string &string) { - // helper::Destroy(type_id, &data); + helper::Destroy(type_id, data.data); type_id = typeid(std::string).hash_code(); strcpy(data.data, string.c_str()); } @@ -109,7 +111,7 @@ struct Variant { "stl lib with string copy)"); exit(0); } else if (type_id == typeid(T).hash_code()) { - return *const_cast(reinterpret_cast(&data)); + return *const_cast(reinterpret_cast(data.data)); } else { PADDLE_MOBILE_THROW_EXCEPTION(" bad cast in variant"); exit(0); @@ -122,7 +124,8 @@ struct Variant { static inline size_t invalid_type() { return typeid(void).hash_code(); } typedef VariantHelper helper; size_t type_id; - RawData data; + // todo use an anto size to suite this. + RawData<64> data; }; template diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 138906c790574a4a0201180b5d18cd67960a7e1d..97746d0b203523b9337af17346b623d96dbf5a88 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "fpga/filter.h" #include "fpga/image.h" #define FPGA_TEST_MODE -#define PADDLE_MOBILE_OS_LINUX +// #define PADDLE_MOBILE_OS_LINUX namespace paddle_mobile { namespace fpga { @@ -125,6 +125,7 @@ float fp16_2_fp32(half fp16_num) { } int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_TEST_MODE DLOG << "======Compute Basic Conv======"; DLOG << " relu_enabled:" << args.relu_enabled << " sb_address:" << args.sb_address @@ -144,11 +145,11 @@ int ComputeBasicConv(const struct ConvArgs &args) { << " stride_w:" << args.kernel.stride_w; DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; - +#endif return do_ioctl(IOCTL_CONFIG_CONV, &args); } -int ComputeFpgaConv(const struct WrapperConvArgs &args) { +int ComputeFpgaConv(const struct SplitConvArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFPGAConv==========="; DLOG << " filter_num:" << args.filter_num @@ -192,8 +193,9 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0 - << " const1:" << args.const1; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(int16_t(args.const0)) + << " const1:" << fp16_2_fp32(int16_t(args.const1)); DLOG << " image0_address:" << args.image0.address << " image0_scale_address:" << args.image0.scale_address << " image0_channels:" << args.image0.channels @@ -381,10 +383,10 @@ void format_concat_output(framework::Tensor *out, int height, int width, out->reset_data_ptr(data_ptr); } -void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, - framework::Tensor *out, framework::Tensor *filter, - bool relu_enabled, int group_num, int stride_h, int stride_w, - int padding_h, int padding_w, float *bs_ptr) { +void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor 
*input, + framework::Tensor *out, framework::Tensor *filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); auto out_ptr = out->data(); @@ -401,8 +403,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_out = out_ptr; arg->concat_arg.scale_out = out->scale; - arg->concat_arg.height = (uint32_t)filter->dims()[2]; - arg->concat_arg.width = (uint32_t)filter->dims()[3]; + arg->concat_arg.height = (uint32_t)out->dims()[2]; + arg->concat_arg.width = (uint32_t)out->dims()[3]; int n = arg->split_num; arg->concat_arg.images_in = @@ -411,7 +413,6 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, (float **)fpga_malloc(n * sizeof(float *)); // NOLINT arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT - arg->concat_arg.image_out = out_ptr; auto channel = (int)out->dims()[1]; // NOLINT int filter_num_per_div = get_filter_num_per_div(filter, group_num); diff --git a/src/fpga/api.h b/src/fpga/api.h index a4f71e119c83de40771f321abfc8bb2821e4523a..f535975a35ecc3c454bbac597b31d8c3670cbf91 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -89,7 +89,7 @@ struct ConcatArgs { uint32_t width; }; -struct WrapperConvArgs { +struct SplitConvArgs { uint32_t split_num; uint32_t group_num; uint32_t filter_num; @@ -98,6 +98,14 @@ struct WrapperConvArgs { struct ConcatArgs concat_arg; }; +struct GroupConvArgs { + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct SplitConvArgs* conv_args; + struct ConcatArgs concat_arg; +}; + struct PoolingArgs { int16_t mode; // mode: 0:max, 1:avg half kernel_reciprocal; @@ -159,30 +167,6 @@ struct MemoryCacheArgs { #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 28, struct FpgaRegReadArgs) #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 
29, struct FpgaRegWriteArgs) -enum FPGA_ERR_TYPE { - ERR_IOCTL_CMD = -1, - ERR_TIMEOUT = -2, - ERR_COMPLETION_TIMEOUT = -3, - ERR_INVALID_FPGA_ADDR = -4, - ERR_NOMEM = -5, - ERR_NO_RESERVE_MEM = -6, - ERR_COPY_FROM_USER = -7, - ERR_COPY_TO_USER = -8, - ERR_DEL_TIMER = -9, - ERR_ENABLE_MSI = -10, - ERR_REGISTER_IRQ = -11, - ERR_PCIE_REGISTER = -12, - ERR_PCIE_PROBE = -13, - ERR_REGISTER_BLOCK = -14, - ERR_ALLOC_GENDISK = -15, - ERR_INIT_QUEUE = -16, - ERR_WAIT = -17, - ERR_ECC_ERROR = -31, - ERR_FPGA_FAIL_STOP = -64, - ERR_FPGA_DEBUG_STOP = -113, - DEV_TMP_UNAVAILABLE = -128 -}; - //============================== API ============================= int open_device(); @@ -195,7 +179,7 @@ int fpga_flush(void* address, size_t size); int fpga_invalidate(void* address, size_t size); int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct WrapperConvArgs& args); +int ComputeFpgaConv(const struct SplitConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args); int ComputeFPGAConcat(const struct ConcatArgs& args); @@ -220,10 +204,10 @@ void format_bias_scale_array(float** bias_scale_array, void format_concat_output(framework::Tensor* out, int height, int width, int image_num, uint32_t* channel_num); -void fill_conv_arg(struct WrapperConvArgs* arg, framework::Tensor* input, - framework::Tensor* out, framework::Tensor* filter, - bool relu_enabled, int group_num, int stride_h, int stride_w, - int padding_h, int padding_w, float* bs_ptr); +void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input, + framework::Tensor* out, framework::Tensor* filter, + bool relu_enabled, int group_num, int stride_h, + int stride_w, int padding_h, int padding_w, float* bs_ptr); half fp32_2_fp16(float fp32_num); float fp16_2_fp32(half fp16_num); diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp index 50f1ed03f0121b5afdc41d427e5b52675994bd1e..23889d5b1fee3d8cb9e4673f42b18574366411eb 
100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/bias_scale.cpp @@ -27,6 +27,9 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); + if (num_per_div_before_alignment == num_per_div_after_alignment) { + return; + } int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale float *ptr_aligned = diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index 34e0ad6f18f8e80d636e42630e03650c018a8825..db851b926bbbd549205ee5d75bc46a6c04888098 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -21,7 +21,10 @@ namespace paddle_mobile { namespace fpga { namespace filter { -int calc_division_capacity(int chw) { return 2048 / ((chw + 15) / 16) * 32; } +int calc_division_capacity(int chw) { + int n = 2048 / ((chw + 15) / 16) * 32; + return n < 2048 ? n : 2048; +} int calc_split_num(int num, int division_capacity) { return (num + division_capacity - 1) / division_capacity; @@ -210,12 +213,12 @@ void format_filter(float **data_in, int num, int channel, int height, int width, align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * div_num; - + int residual = num % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? 
div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); align_num(quantize_data, num_per_div_before_alignment, num, chw); diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index a2a6da34849641b4f99310621445cb312c7d5227..03fdd8d433cd40aa7ba4786f02221bd24bd3a050 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -199,6 +199,12 @@ LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA); #ifdef MULTICLASSNMS_OP LOAD_OP1(multiclass_nms, CPU); #endif +#ifdef SUM_OP +LOAD_OP1(sum, CPU); +#endif +#ifdef ELEMENTWISEMUL_OP +LOAD_OP1(elementwise_mul, CPU); +#endif #ifdef SLICE_OP LOAD_OP2(slice, CPU, MALI_GPU); #endif @@ -206,5 +212,8 @@ LOAD_OP2(slice, CPU, MALI_GPU); LOAD_OP2(fusion_conv_bn, CPU, FPGA); LOAD_FUSION_MATCHER(fusion_conv_bn); #endif +#ifdef ELEMENTWISESUB_OP +LOAD_OP1(elementwise_sub, CPU) +#endif LOAD_OP1(quantize, CPU); LOAD_OP1(dequantize, CPU); diff --git a/src/framework/mixed_vector.h b/src/framework/mixed_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..031d73179c991229ec99ebdde927b0ad1532d82b --- /dev/null +++ b/src/framework/mixed_vector.h @@ -0,0 +1,272 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "framework/tensor.h" +#include "framework/tensor_util.h" + +namespace paddle_mobile { +namespace framework { + +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class Vector { + public: + using value_type = T; + // Default ctor. Create empty Vector + Vector() { InitEmpty(); } + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T& value = T()) { + InitEmpty(); + if (count != 0) { + resize(count); + T* ptr = begin(); + for (size_t i = 0; i < count; ++i) { + ptr[i] = value; + } + } + } + + // Ctor with init_list + Vector(std::initializer_list init) { + if (init.size() == 0) { + InitEmpty(); + } else { + InitByIter(init.size(), init.begin(), init.end()); + } + } + + // implicit cast from std::vector. + template + Vector(const std::vector& dat) { // NOLINT + if (dat.size() == 0) { + InitEmpty(); + } else { + InitByIter(dat.size(), dat.begin(), dat.end()); + } + } + + // Copy ctor + Vector(const Vector& other) { this->operator=(other); } + + // Copy operator + Vector& operator=(const Vector& other) { + if (other.size() != 0) { + this->InitByIter(other.size(), other.begin(), other.end()); + } else { + InitEmpty(); + } + return *this; + } + + // Move ctor + Vector(Vector&& other) { + this->size_ = other.size_; + this->flag_ = other.flag_; + if (other.cuda_vec_.memory_size()) { + this->cuda_vec_.ShareDataWith(other.cuda_vec_); + } + if (other.cpu_vec_.memory_size()) { + this->cpu_vec_.ShareDataWith(other.cpu_vec_); + } + } + + // CPU data access method. Mutable. + T& operator[](size_t i) { + MutableCPU(); + return const_cast(cpu_vec_.data())[i]; + } + + // CPU data access method. Immutable. + const T& operator[](size_t i) const { + // ImmutableCPU(); + return cpu_vec_.data()[i]; + } + + // std::vector iterator methods. 
Based on CPU data access method + size_t size() const { return size_; } + + T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } + + T* end() { + return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); + } + + T& front() { return *begin(); } + + T& back() { + auto it = end(); + --it; + return *it; + } + + const T* begin() const { + return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); + } + + const T* end() const { + return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); + } + + const T* cbegin() const { return begin(); } + + const T* cend() const { return end(); } + + const T& back() const { + auto it = end(); + --it; + return *it; + } + + T* data() { return begin(); } + + const T* data() const { return begin(); } + + const T& front() const { return *begin(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + InitByIter(end - begin, begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + if (size_ + 1 > capacity()) { + reserve((size_ + 1) << 1); + } + *end() = elem; + ++size_; + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + size_t pre_size = size_; + resize(pre_size + (end - begin)); + T* ptr = this->begin() + pre_size; + for (; begin < end; ++begin, ++ptr) { + *ptr = *begin; + } + } + + // resize the vector + void resize(size_t size) { + if (size + 1 <= capacity()) { + size_ = size; + } else { + MutableCPU(); + Tensor cpu_tensor; + T* ptr = cpu_tensor.mutable_data( + framework::make_ddim({static_cast(size)})); + const T* old_ptr = + cpu_vec_.memory_size() == 0 ? 
nullptr : cpu_vec_.data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + size_, ptr); + } + size_ = size; + cpu_vec_.ShareDataWith(cpu_tensor); + } + } + + // clear + void clear() { + size_ = 0; + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { + return cpu_vec_.memory_size() / SizeOfType(typeid(T)); + } + + // reserve data + void reserve(size_t size) { + size_t pre_size = size_; + resize(size); + resize(pre_size); + } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + std::vector result; + result.resize(size()); + std::copy(begin(), end(), result.begin()); + return result; + } + + bool operator==(const Vector& other) const { + if (size() != other.size()) return false; + auto it1 = cbegin(); + auto it2 = other.cbegin(); + for (; it1 < cend(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } + + private: + void InitEmpty() { + size_ = 0; + flag_ = kDataInCPU; + } + + template + void InitByIter(size_t size, Iter begin, Iter end) { + T* ptr = this->cpu_vec_.template mutable_data( + framework::make_ddim({static_cast(size)})); + for (size_t i = 0; i < size; ++i) { + *ptr++ = *begin++; + } + flag_ = kDataInCPU | kDirty; + size_ = size; + } + + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. 
+ kDirty = 0x10 + }; + + void MutableCPU() { flag_ = kDirty | kDataInCPU; } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + static T& EmptyDummy() { + static T dummy = T(); + return dummy; + } + + mutable int flag_; + mutable Tensor cpu_vec_; + mutable Tensor cuda_vec_; + size_t size_; +}; + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/selected_rows.cpp b/src/framework/selected_rows.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96e72051e5bf882c3549fb94cd8119ffc4fdfb9c --- /dev/null +++ b/src/framework/selected_rows.cpp @@ -0,0 +1,127 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "framework/selected_rows.h" + +namespace paddle_mobile { +namespace framework { + +struct ReAllocateVisitor { + ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims) + : tensor_(tensor), dims_(dims) {} + + template + void operator()() const { + framework::Tensor cpu_tensor; + T* ptr = cpu_tensor.mutable_data(dims_); + const T* old_ptr = + tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); + } + tensor_->ShareDataWith(cpu_tensor); + } + + framework::Tensor* tensor_; + framework::DDim dims_; +}; +// TensorCopyVisitor(value, i * value_width, *value_.get(), +// index * value_width, value_width)); +struct TensorCopyVisitor { + TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, + const framework::Tensor src, int64_t src_offset, + int64_t size) + : dst_(dst), + dst_offset_(dst_offset), + src_(src), + src_offset_(src_offset), + size_(size) {} + + template + void operator()() const { + // TODO(Yancey1989): support other place + memory::Copy(dst_->mutable_data() + dst_offset_, + src_.data() + src_offset_, size_ * sizeof(T)); + } + + framework::Tensor* dst_; + int64_t dst_offset_; + framework::Tensor src_; + int64_t src_offset_; + int64_t size_; +}; + +bool SelectedRows::HasKey(int64_t key) const { + return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? 
false + : true; +} + +// std::vector SelectedRows::Get(std::vector keys, +// framework::Tensor* value) const { +// PADDLE_MOBILE_ENFORCE(value->IsInitialized(), +// "The value tensor should be initialized."); +// std::vector non_keys; +// int64_t value_width = value_->numel() / value_->dims()[0]; +// PADDLE_MOBILE_ENFORCE(value_width == value->numel() / value->dims()[0], +// "output tensor should have the same shape with table " +// "execpt the dims[0]."); +// +// for (size_t i = 0; i < keys.size(); ++i) { +// int64_t index = Index(keys[i]); +// if (index == -1) { +// non_keys.push_back(keys[i]); +// } else { +// framework::VisitDataType( +// framework::ToDataType(value_->type()), +// TensorCopyVisitor(value, i * value_width, *value_.get(), +// index * value_width, value_width)); +// } +// } +// return non_keys; +//} + +// bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { +// PADDLE_MOBILE_ENFORCE(value.IsInitialized(), "The value should be +// initialized."); if (value_->IsInitialized()) { +// PADDLE_MOBILE_ENFORCE( +// value.type() == value_->type(), +// "The type of the value should be same with the original value"); +// } +// PADDLE_MOBILE_ENFORCE(value.dims()[0] == static_cast(1), +// "The first dim of value should be 1."); +// auto index = Index(key); +// bool is_new_key = false; +// if (index == -1) { +// rows_.push_back(key); +// index = rows_.size() - 1; +// is_new_key = true; +// // whether need to resize the table +// if (static_cast(rows_.size()) > value_->dims()[0]) { +// auto dims = value_->dims(); +// dims[0] = (dims[0] + 1) << 1; +// framework::VisitDataType(framework::ToDataType(value.type()), +// ReAllocateVisitor(value_.get(), dims)); +// } +// } +// +// framework::VisitDataType( +// framework::ToDataType(value.type()), +// TensorCopyVisitor(value_.get(), +// index * value_->numel() / value_->dims()[0], value, +// static_cast(0), value.numel())); +// return is_new_key; +//} + +} // namespace framework +} // namespace 
paddle_mobile diff --git a/src/framework/selected_rows.h b/src/framework/selected_rows.h new file mode 100644 index 0000000000000000000000000000000000000000..db49bd91159116883e5fcb148ef3ed012ec42e71 --- /dev/null +++ b/src/framework/selected_rows.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "framework/lod_tensor.h" +#include "framework/mixed_vector.h" +#include "framework/tensor.h" +#include "memory/t_malloc.h" + +namespace paddle_mobile { +namespace framework { + +class SelectedRows { + /* + * @brief We can use the SelectedRows structure to reproduce a sparse table. + * A sparse table is a key-value structure that the key is an `int64_t` + * number, + * and the value is a Tensor which the first dimension is 0. + * You can use the following interface to operate the sparse table, and you + * can find + * some detail information from the comments of each interface: + * + * HasKey(key), whether the sparse table has the specified key. + * Set(key, value), set a key-value pair into the sparse table. + * Get(keys, value*), get value by given key list and apply it to the given + * value pointer + * with the specified offset. 
+ * + */ + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + } + + SelectedRows() { + height_ = 0; + value_.reset(new Tensor()); + } + + // platform::Place place() const { return value_->place(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const Vector& rows() const { return rows_; } + + Vector* mutable_rows() { return &rows_; } + + void set_rows(const Vector& rows) { rows_ = rows; } + + /* + * @brief wheter has the specified key in the table. + * + * @return true if the key is exists. + */ + bool HasKey(int64_t key) const; + + /* + * @brief Get value by the key list, if the + * + * @return a list of keys which does not exists in table + */ + std::vector Get(std::vector keys, + framework::Tensor* tensor) const; + + /* + * @brief Set a key-value pair into the table. + * This function will double the value memory if it's not engouth. + * + * @note: + * 1. The first dim of the value should be 1 + * 2. The value should be initialized and the data type + * should be the same with the table. + * + * @return true if the key is a new one, otherwise false + * + */ + bool Set(int64_t key, const Tensor& value); + + /* + * @brief Get the index of key in rows + * + * @return -1 if the key does not exists. + */ + int64_t Index(int64_t key) const { + auto it = std::find(rows_.begin(), rows_.end(), key); + if (it == rows_.end()) { + return static_cast(-1); + } + return static_cast(std::distance(rows_.begin(), it)); + } + + DDim GetCompleteDims() const { + std::vector dims = vectorize(value_->dims()); + dims[0] = height_; + return make_ddim(dims); + } + + private: + // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. + // SelectedRows are simply concated when adding together. 
Until a + // SelectedRows add a Tensor, will the duplicate rows be handled. + Vector rows_; + std::unique_ptr value_{nullptr}; + int64_t height_; +}; + +/* + * Serialize/Desiralize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows); + +} // namespace framework +} // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index c3e1393dc045c3be804407f905a974b716b4442a..496cde98e57561ca048f356fa397f5447b9050f5 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -338,6 +338,8 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { for (int i = 0; i < tensor.numel(); i += stride) { if (tensor.type() == typeid(float)) { printer << tensor.data()[i] << " "; + } else if (tensor.type() == typeid(int32_t)) { + printer << tensor.data()[i] << " "; } else if (tensor.type() == typeid(int64_t)) { printer << tensor.data()[i] << " "; } else if (tensor.type() == typeid(int8_t)) { diff --git a/src/operators/elementwise_mul_op.cpp b/src/operators/elementwise_mul_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..335a908ace54664f0bcbca37bdcde30047edee5d --- /dev/null +++ b/src/operators/elementwise_mul_op.cpp @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEMUL_OP + +#include "operators/elementwise_mul_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void ElementwiseMulOp::InferShape() const { + auto x_dim = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/elementwise_mul_op.h b/src/operators/elementwise_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..991b03a486d65c720b88b80a1aece417b9919d3d --- /dev/null +++ b/src/operators/elementwise_mul_op.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEMUL_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "kernel/elementwise_mul_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +template +class ElementwiseMulOp : public framework::OperatorWithKernel< + DeviceType, ElementwiseMulParam, + operators::ElementwiseMulKernel> { + public: + ElementwiseMulOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, ElementwiseMulParam, + operators::ElementwiseMulKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, ElementwiseMulParam, + operators::ElementwiseMulKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/elementwise_sub_op.cpp b/src/operators/elementwise_sub_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e5ec33ced29f02a524350ed907ef69f2a5dbfca8 --- /dev/null +++ b/src/operators/elementwise_sub_op.cpp @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISESUB_OP + +#include "operators/elementwise_sub_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void ElementwiseSubOp::InferShape() const { + auto x_dim = this->param_.InputX()->dims(); + this->param_.Out()->Resize(x_dim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(elementwise_sub, ops::ElementwiseSubOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/elementwise_sub_op.h b/src/operators/elementwise_sub_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2edd2581a9d3929a29459df60f514132796a53e2 --- /dev/null +++ b/src/operators/elementwise_sub_op.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISESUB_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "kernel/elementwise_sub_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +template +class ElementwiseSubOp : public framework::OperatorWithKernel< + DeviceType, ElementwiseSubParam, + operators::ElementwiseSubKernel> { + public: + ElementwiseSubOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, ElementwiseSubParam, + operators::ElementwiseSubKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, ElementwiseSubParam, + operators::ElementwiseSubKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/elementwise_mul_kernel.cpp b/src/operators/kernel/arm/elementwise_mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..00205952a2567aae5927e318c494c90bc4a5ffbb --- /dev/null +++ b/src/operators/kernel/arm/elementwise_mul_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEMUL_OP + +#include "operators/kernel/elementwise_mul_kernel.h" +#include "operators/kernel/central-arm-func/elementwise_mul_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { + return true; +} + +template <> +void ElementwiseMulKernel::Compute( + const ElementwiseMulParam ¶m) const { + ElementwiseMulCompute(param); + param.Out()->set_lod(param.InputX()->lod()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/elementwise_sub_kernel.cpp b/src/operators/kernel/arm/elementwise_sub_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d78b3e31098ef7ef929a0d2c00043fab7193b01c --- /dev/null +++ b/src/operators/kernel/arm/elementwise_sub_kernel.cpp @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISESUB_OP + +#include "operators/kernel/elementwise_sub_kernel.h" +#include "operators/kernel/central-arm-func/elementwise_sub_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ElementwiseSubKernel::Init(ElementwiseSubParam *param) { + return true; +} + +template <> +void ElementwiseSubKernel::Compute( + const ElementwiseSubParam ¶m) const { + ElementwiseSubCompute(param); + param.Out()->set_lod(param.InputX()->lod()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/mul_kernel.cpp b/src/operators/kernel/arm/mul_kernel.cpp index aa3ee7077eb7db440c8493eae5b95f03a42196a4..276281f963e449af9d55f7c5ca58ef5da17e6f93 100644 --- a/src/operators/kernel/arm/mul_kernel.cpp +++ b/src/operators/kernel/arm/mul_kernel.cpp @@ -31,6 +31,8 @@ void MulKernel::Compute(const MulParam ¶m) const { param.Out()->set_lod(param.InputX()->lod()); } +template class MulKernel; + } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/arm/sum_kernel.cpp b/src/operators/kernel/arm/sum_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0290037522a2bf3b3c88ce129eda277a401fecb5 --- /dev/null +++ b/src/operators/kernel/arm/sum_kernel.cpp @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SUM_OP + +#include "operators/kernel/sum_kernel.h" +#include "operators/kernel/central-arm-func/sum_arm_func.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SumKernel::Init(SumParam *param) { + return true; +} + +template <> +void SumKernel::Compute(const SumParam ¶m) const { + SumCompute(param); + param.Out()->set_lod(param.Inputs()[0]->lod()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..0aed7ff8d4f7abbe64de288e4f22d3b691a23bbc --- /dev/null +++ b/src/operators/kernel/central-arm-func/elementwise_mul_arm_func.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEMUL_OP + +#pragma once +#include "operators/math/elementwise_op_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +struct MulFunctor { + inline T operator()(T a, T b) const { return a * b; } +}; + +template +void ElementwiseMulCompute(const ElementwiseMulParam ¶m) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data(); + int axis = param.Axis(); + ElementwiseComputeEx, float>(input_x, input_y, axis, + MulFunctor(), Out); +} + +template class ElementwiseMulKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..663c65c83a0f5b76e292925ea8cb0994b0f99ad1 --- /dev/null +++ b/src/operators/kernel/central-arm-func/elementwise_sub_arm_func.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISESUB_OP + +#pragma once +#include "operators/math/elementwise_op_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +struct SubFunctor { + inline T operator()(T a, T b) const { return a - b; } +}; + +template +void ElementwiseSubCompute(const ElementwiseSubParam ¶m) { + const Tensor *input_x = param.InputX(); + const Tensor *input_y = param.InputY(); + Tensor *Out = param.Out(); + Out->mutable_data(); + int axis = param.Axis(); + ElementwiseComputeEx, float>(input_x, input_y, axis, + SubFunctor(), Out); +} + +template class ElementwiseSubKernel; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h index 9de57910540b4c9f7ab807053add9c5af9947ae7..533edd69b6160115fb81066cb1928fb4246ca5be 100644 --- a/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h +++ b/src/operators/kernel/central-arm-func/multiclass_nms_arm_func.h @@ -20,14 +20,12 @@ limitations under the License. */ #include #include #include "framework/tensor.h" +#include "operators/math/poly_util.h" #include "operators/op_param.h" namespace paddle_mobile { namespace operators { -constexpr int kOutputDim = 6; -constexpr int kBBoxSize = 4; - template bool SortScorePairDescend(const std::pair& pair1, const std::pair& pair2) { @@ -90,6 +88,21 @@ static inline T JaccardOverlap(const T* box1, const T* box2, } } +template +static inline T PolyIoU(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + T bbox1_area = math::PolyArea(box1, box_size, normalized); + T bbox2_area = math::PolyArea(box2, box_size, normalized); + T inter_area = math::PolyOverlapArea(box1, box2, box_size, normalized); + if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { + // If coordinate values are is invalid + // if area size <= 0, return 0. 
+ return static_cast(0.); + } else { + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + template static inline void NMSFast(const framework::Tensor& bbox, const framework::Tensor& scores, @@ -116,8 +129,14 @@ static inline void NMSFast(const framework::Tensor& bbox, for (size_t k = 0; k < selected_indices->size(); ++k) { if (keep) { const int kept_idx = (*selected_indices)[k]; - T overlap = JaccardOverlap(bbox_data + idx * box_size, + T overlap = T(0.); + if (box_size == 4) { + overlap = JaccardOverlap(bbox_data + idx * box_size, bbox_data + kept_idx * box_size, true); + } else { + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, true); + } keep = overlap <= adaptive_threshold; } else { break; @@ -190,6 +209,8 @@ void MultiClassOutput(const framework::Tensor& scores, const std::map>& selected_indices, framework::Tensor* outs) { int predict_dim = scores.dims()[1]; + int box_size = bboxes.dims()[1]; + int out_dim = bboxes.dims()[1] + 2; auto* scores_data = scores.data(); auto* bboxes_data = bboxes.data(); auto* odata = outs->data(); @@ -202,11 +223,11 @@ void MultiClassOutput(const framework::Tensor& scores, const std::vector& indices = it.second; for (size_t j = 0; j < indices.size(); ++j) { int idx = indices[j]; - const T* bdata = bboxes_data + idx * kBBoxSize; - odata[count * kOutputDim] = label; // label - odata[count * kOutputDim + 1] = sdata[idx]; // score + const T* bdata = bboxes_data + idx * box_size; + odata[count * out_dim] = label; // label + odata[count * out_dim + 1] = sdata[idx]; // score // xmin, ymin, xmax, ymax - std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T)); + std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); count++; } } @@ -256,7 +277,8 @@ void MultiClassNMSCompute(const MultiClassNMSParam& param) { float* od = outs->mutable_data({1}); od[0] = -1; } else { - outs->mutable_data({num_kept, kOutputDim}); + int64_t out_dim = box_dim + 2; + 
outs->mutable_data({num_kept, out_dim}); for (int64_t i = 0; i < batch_size; ++i) { framework::Tensor ins_score = input_scores->Slice(i, i + 1); ins_score.Resize({class_num, predict_dim}); diff --git a/src/operators/kernel/central-arm-func/sum_arm_func.h b/src/operators/kernel/central-arm-func/sum_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..36c7ac9694bde85fbf702ad8adf5ffda8744da1d --- /dev/null +++ b/src/operators/kernel/central-arm-func/sum_arm_func.h @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SUM_OP +#pragma once + +#include +#include "operators/math/selected_rows_functor.h" + +namespace paddle_mobile { +namespace operators { + +using LoDTensorArray = std::vector; + +template +void SumCompute(const SumParam ¶m) { + auto inputsvars = param.InputsVars(); + int N = inputsvars.size(); + auto *outvar = param.OutVar(); + + bool in_place = outvar == inputsvars[0]; + if (outvar->IsType()) { + auto *out = outvar->GetMutable(); + if (!in_place) { + out->mutable_data(); + } + auto *outptr = out->data(); + // auto result = Flatten(*out); + + if (!in_place) { + std::fill(out->data(), out->data() + out->numel(), 0); + } + math::SelectedRowsAddToTensor functor; + for (int i = in_place ? 
1 : 0; i < N; i++) { + if (inputsvars[i]->IsType()) { + auto *in_t = inputsvars[i]->Get(); + auto *inptr = in_t->data(); + if (in_t->numel() == 0) { + continue; + } + for (int j = 0; j < out->numel(); ++j) { + outptr[j] = outptr[j] + inptr[j]; + } + + } else if (inputsvars[i]->IsType()) { + auto *in_t = inputsvars[i]->Get(); + functor(*in_t, out); + } else { + PADDLE_MOBILE_THROW_EXCEPTION( + "Variable type must be LoDTensor/SelectedRows."); + } + } + + } else if (outvar->IsType()) { + std::unique_ptr in0; + if (in_place) { + // If is in_place, we store the input[0] to in0 + auto *in_sel0 = inputsvars[0]->Get(); + auto &rows = in_sel0->rows(); + in0.reset(new framework::SelectedRows(rows, in_sel0->height())); + in0->mutable_value()->ShareDataWith(in_sel0->value()); + } + + auto get_selected_row = [&](size_t i) -> const framework::SelectedRows & { + if (i == 0 && in0) { + return *in0.get(); + } else { + return *(inputsvars[i]->Get()); + } + }; + + auto *out = outvar->GetMutable(); + out->mutable_rows()->clear(); + auto *out_value = out->mutable_value(); + + // Runtime InferShape + size_t first_dim = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + first_dim += sel_row.rows().size(); + } + auto in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim[0] = static_cast(first_dim); + + out_value->Resize(framework::make_ddim(in_dim)); + + // if all the input sparse vars are empty, no need to + // merge these vars. 
+ if (first_dim == 0UL) { + return; + } + out_value->mutable_data(); + math::SelectedRowsAddTo functor; + + int64_t offset = 0; + for (int i = 0; i < N; i++) { + auto &sel_row = get_selected_row(i); + if (sel_row.rows().size() == 0) { + continue; + } + PADDLE_MOBILE_ENFORCE(out->height() == sel_row.height(), + "seletrows height != outheight"); + functor(sel_row, offset, out); + offset += sel_row.value().numel(); + } + } else if (outvar->IsType()) { + auto &out_array = *outvar->GetMutable(); + for (size_t i = in_place ? 1 : 0; i < inputsvars.size(); ++i) { + PADDLE_MOBILE_ENFORCE(inputsvars[i]->IsType(), + "Only support all inputs are TensorArray"); + auto *in_array = inputsvars[i]->Get(); + + for (size_t i = 0; i < in_array->size(); ++i) { + if ((*in_array)[i].numel() != 0) { + if (i >= out_array.size()) { + out_array.resize(i + 1); + } + if (out_array[i].numel() == 0) { + framework::TensorCopy((*in_array)[i], &out_array[i]); + out_array[i].set_lod((*in_array)[i].lod()); + } else { + PADDLE_MOBILE_ENFORCE(out_array[i].lod() == (*in_array)[i].lod(), + "outLod != inLod"); + auto *inptr = (*in_array)[i].data(); + auto *outptr = out_array[i].data(); + + for (int j = 0; j < (*in_array)[i].numel(); ++j) { + outptr[j] = inptr[j] + outptr[j]; + } + } + } + } + } + } else { + PADDLE_MOBILE_THROW_EXCEPTION( + "Unexpected branch, output variable type is %s", outvar->Type().name()); + } +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..63f0df4815dc143e482140a855eb254bd016d50c --- /dev/null +++ b/src/operators/kernel/elementwise_mul_kernel.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEMUL_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/math/elementwise_op_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class ElementwiseMulKernel + : public framework::OpKernelBase> { + public: + void Compute(const ElementwiseMulParam ¶m) const; + bool Init(ElementwiseMulParam *param); +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/elementwise_sub_kernel.h b/src/operators/kernel/elementwise_sub_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9516dcbd3de09debe233571eb5f60b3b8b19a2fa --- /dev/null +++ b/src/operators/kernel/elementwise_sub_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef ELEMENTWISEADD_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/math/elementwise_op_function.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class ElementwiseSubKernel + : public framework::OpKernelBase> { + public: + void Compute(const ElementwiseSubParam ¶m) const; + bool Init(ElementwiseSubParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 671df76967b4537d111695cdbe091b9c7de2c5a2..9b3944fc9a9ab308d9fe8b791a34e09651b87e6e 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -66,10 +66,11 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index ea01245f1207739d4234ea3509451a2de1d321f4..83f74e97d04eda29f3aaa6a0cc16ed7d194321d8 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -65,10 +65,11 @@ bool ConvAddBNReluKernel::Init( fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - 
param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 928b73e4d30144cdf1128a018628b6208fcfd5f0..4975f2a905dcd76c5b7f013eafaa376dd2bb1646 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -47,10 +47,11 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index fea211af74b634fc0dd8dcee1db7c2c004145561..276e71b6a44e9a7beba0d5db2f51472a9927d8da 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -59,10 +59,11 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + 
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 87fe12664e75717c78d79ec50821a9bb6201c5a0..f519a37cb57378a603969adae255f88ae8a5df2a 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -59,10 +59,11 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, - param->Groups(), param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, + param->Groups(), param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 904dd8a1da9e67d0c1283806e766d3a25dc27309..52d7c0a4e69080e11f86d1507829e7e779a69228 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -44,6 +44,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; + out->Resize(framework::make_ddim({1, channel, 1, 1})); filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_fc_filter(filter, max_value); @@ -52,9 +53,9 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); 
fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, - 0, bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 46dae1b2a076add9f17e4e5bc6d3a99ad583fb50..407e14238d542604e876ced624d5a0db698a6101 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -45,6 +45,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; + out->Resize(framework::make_ddim({1, channel, 1, 1})); filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_fc_filter(filter, max_value); @@ -53,9 +54,9 @@ bool FusionFcKernel::Init(FusionFcParam *param) { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); fpga::format_fp16_ofm(out); - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, - 0, bs_ptr); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, + 0, 0, bs_ptr); param->SetFpgaArgs(conv_arg); return true; } diff --git a/src/operators/kernel/fpga/mul_kernel.cpp b/src/operators/kernel/fpga/mul_kernel.cpp deleted file mode 100644 index 07aa4bcc43d28805ab0660bf89149c5ec5f1c732..0000000000000000000000000000000000000000 --- a/src/operators/kernel/fpga/mul_kernel.cpp +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef MUL_OP - -#include "operators/kernel/mul_kernel.h" - -namespace paddle_mobile { -namespace operators { - -template <> -bool MulKernel::Init(MulParam *param) { - bool relu_enabled = false; - auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); - auto out = param->Out(); - - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); - int channel = (uint32_t)out->dims()[1]; - auto bs_ptr = - (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT - for (int i = 0; i < channel; i++) { - bs_ptr[i + channel] = 1; - bs_ptr[i] = 0; - } - int num = (uint32_t)filter->dims()[1]; - int chw = (uint32_t)filter->dims()[0]; - PADDLE_MOBILE_ENFORCE( - chw == input_x->numel(), - "Filter element num should be equal to IFM element num"); - int height = (uint32_t)input_x->dims()[2]; - int width = (uint32_t)input_x->dims()[3]; - int filter_channel = chw / height / width; - - filter->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(filter); - fpga::format_fc_filter(filter, max_value); - - int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); - fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_fp16_ofm(out); - - fpga::WrapperConvArgs conv_arg = {0}; - fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, - 0, bs_ptr); - param->SetFpgaArgs(conv_arg); - return true; -} - -template <> -void MulKernel::Compute(const MulParam ¶m) const { - 
fpga::ComputeFpgaConv(param.FpgaArgs()); -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index dba555708f505eb9bdf81d6f4487227c88f0a616..e36db57f4b4f18712df50b2b132cdd1032a41921 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -27,7 +27,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); auto input_ptr = input->data(); auto float_input = new Tensor; - float_input->mutable_data(input->dims()); + float_input->mutable_data({1, input->dims()[1]}); fpga::format_fp32_ofm(float_input); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -56,7 +56,6 @@ void SoftmaxKernel::Compute( fpga::fpga_invalidate( (void *)in_x->data(), // NOLINT fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); - math::SoftmaxFuntor()(in_x, out); fpga::fpga_flush(out->data(), out->memory_size()); } diff --git a/src/operators/kernel/sum_kernel.h b/src/operators/kernel/sum_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ed337432e0fd4bf4035b67d4099379ce29918547 --- /dev/null +++ b/src/operators/kernel/sum_kernel.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef SUM_OP + +#pragma once +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class SumKernel + : public framework::OpKernelBase> { + public: + void Compute(const SumParam ¶m) const; + bool Init(SumParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/depthwise_conv_3x3.cpp b/src/operators/math/depthwise_conv_3x3.cpp index 91e11fa8ff0184e5321269167b5f4693de2245ac..b6cf28a9ca665a1496ee8032f87c013137deade8 100644 --- a/src/operators/math/depthwise_conv_3x3.cpp +++ b/src/operators/math/depthwise_conv_3x3.cpp @@ -1667,7 +1667,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, const int w_times = (out_w - 2) / 3; float32x4_t zero = vdupq_n_f32(0.0); for (int b = batch_size; b > 0; --b) { - #pragma omp parallel for +#pragma omp parallel for for (int j = 0; j < c; j++) { const float *input_row_ptr; float *output_row_ptr; @@ -1912,9 +1912,7 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter, float w20 = filter_data[6]; float w21 = filter_data[7]; float w22 = filter_data[8]; - float32x4_t biasv = vld1q_dup_f32(bias_data); - for (int i = 0; i < output_height; i += 1) { for (int m = 0; m < output_width - 2; m += 3) { float *output_ptr = output_data + i * output_width + m; @@ -1949,8 +1947,9 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter, out0 = vmlaq_n_f32(out0, in4, w20); out0 = vmlaq_n_f32(out0, tmp4, w21); out0 = vmlaq_n_f32(out0, tmp5, w22); - out0 = vaddq_f32(out0, biasv); - + if (if_bias) { + out0 = vaddq_f32(out0, biasv); + } vst1q_lane_f32(output_ptr, out0, 0); vst1q_lane_f32(output_ptr + 1, out0, 1); vst1q_lane_f32(output_ptr + 2, out0, 2); @@ -1960,16 +1959,18 @@ void DepthwiseConv3x3s2p0(const Tensor *input, const Tensor *filter, } for (int j = m; j < output_width; j++) { output_data[i * output_width + j] = - input_data[(2 * i - 1) * 
input_width + 2 * j - 1] * w00 + - input_data[(2 * i - 1) * input_width + 2 * j] * w01 + - input_data[(2 * i - 1) * input_width + 2 * j + 1] * w02 + - input_data[(2 * i) * input_width + 2 * j - 1] * w10 + - input_data[(2 * i) * input_width + 2 * j] * w11 + - input_data[(2 * i) * input_width + 2 * j + 1] * w12 + - input_data[(2 * i + 1) * input_width + 2 * j - 1] * w20 + - input_data[(2 * i + 1) * input_width + 2 * j] * w21 + - input_data[(2 * i + 1) * input_width + 2 * j + 1] * w22; - output_data[i * output_width + j] += *bias_data; + input_data[(2 * i) * input_width + 2 * j] * w00 + + input_data[(2 * i) * input_width + 2 * j + 1] * w01 + + input_data[(2 * i) * input_width + 2 * j + 2] * w02 + + input_data[(2 * i + 1) * input_width + 2 * j] * w10 + + input_data[(2 * i + 1) * input_width + 2 * j + 1] * w11 + + input_data[(2 * i + 1) * input_width + 2 * j + 2] * w12 + + input_data[(2 * i + 2) * input_width + 2 * j] * w20 + + input_data[(2 * i + 2) * input_width + 2 * j + 1] * w21 + + input_data[(2 * i + 2) * input_width + 2 * j + 2] * w22; + if (if_bias) { + output_data[i * output_width + j] += *bias_data; + } } } } diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index b937173dd3f2d12b153840d99cb35ccb80317dfd..adc6924d8ad273012a9b44677f8ad1a29bc37787 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -187,29 +187,29 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, const float *B, int ldb, float *C, int ldc, float *p, std::string mode, float *bias, float *bias1); - /************************ 8 bit function cluster ************************/ - // 8 bit int small block inner product + // 8 bits function cluster begins + // 8 bits int small block inner product void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc); - // 8 bit int inner product + // 8 bits int inner product void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, const int8_t *a, const int8_t *b, int8_t beta, 
int32_t *c, int32_t *C, int32_t ldc, bool relu, int8_t *bias); - // 8 bit int pack function + // 8 bits int pack function void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer); void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer); - // 8 bit int matrix product + // 8 bits int matrix product void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C, int32_t ldc, bool relu, int8_t *bias); - // 8 bit int write back + // 8 bits int write back // C = alpha * A * B + beta * C void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc); @@ -239,7 +239,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, float *packedC; float *zero; - // 8 bit int + // 8 bits int int8_t *packedA_int8; int8_t *packedB_int8; int32_t *packedC_int8; diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp index c52cd2fb299b0d9d5e1086db4c9b004a9033af05..bd5286dbcb5c871d5d327875b836ad9777c270bf 100644 --- a/src/operators/math/gemm_int8.cpp +++ b/src/operators/math/gemm_int8.cpp @@ -27,7 +27,7 @@ namespace paddle_mobile { namespace operators { namespace math { -// 8 bit int small block inner product +// 8 bits int small block inner product void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc) { #if __ARM_NEON @@ -36,344 +36,409 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, b_ptr = b; int32_t kc1 = k >> 3; int32_t kc2 = k & 7; - int32_t kc3 = kc2 >> 1; - int32_t kc4 = kc2 & 1; + int32_t kc3 = kc2 >> 2; + int32_t kc4 = kc2 & 3; + int32_t kc5 = kc4 >> 1; + int32_t kc6 = kc4 & 1; int32_t step = sizeof(int32_t) * ldc; asm volatile( // q4-q15: save 48 results - "pld [%[a_ptr]] \n\t" - "pld [%[b_ptr]] \n\t" - "vmov.s8 q4, #0 \n\t" - "vmov.s8 q5, #0 \n\t" - "vmov.s8 q6, #0 \n\t" - 
"vmov.s8 q7, #0 \n\t" - "vmov.s8 q8, #0 \n\t" - "vmov.s8 q9, #0 \n\t" - "vmov.s8 q10, #0 \n\t" - "vmov.s8 q11, #0 \n\t" - "vmov.s8 q12, #0 \n\t" - "vmov.s8 q13, #0 \n\t" - "vmov.s8 q14, #0 \n\t" - "vmov.s8 q15, #0 \n\t" - "mov r0, #6 \n\t" - "subs %[kc1], %[kc1], #1 \n\t" - "blt 1f \n\t" - "0: \n\t" - "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 - "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 - // used - "vld1.s8 {d2-d3}, [%[b_ptr]]! \n\t" // B row0, B - // row1, q1 - // used - "vmov.s8 q2, #0 \n\t" // q2 used - "vdup.s8 d6, d0[0] \n\t" - "vdup.s8 d7, d1[0] \n\t" // q3 used - "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B - // row0 - "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B - // row1, q3 - // free - "vaddw.s16 q4, q4, d4 \n\t" - "vaddw.s16 q5, q5, d5 \n\t" // res row 0 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[1] \n\t" - "vdup.s8 d7, d1[1] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q6, q6, d4 \n\t" - "vaddw.s16 q7, q7, d5 \n\t" // res row 1 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[2] \n\t" - "vdup.s8 d7, d1[2] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q8, q8, d4 \n\t" - "vaddw.s16 q9, q9, d5 \n\t" // res row 2 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[3] \n\t" - "vdup.s8 d7, d1[3] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q10, q10, d4 \n\t" - "vaddw.s16 q11, q11, d5 \n\t" // res row 3 - "vmov.s8 q2, #0. \n\t" - "vdup.s8 d6, d0[4] \n\t" - "vdup.s8 d7, d1[4] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q12, q12, d4 \n\t" - "vaddw.s16 q13, q13, d5 \n\t" // res row 4 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[5] \n\t" - "vdup.s8 d7, d1[5] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q14, q14, d4 \n\t" - "vaddw.s16 q15, q15, d5 \n\t" // res row 5 - - "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 - "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 - // used - "vld1.s8 {d2-d3}, [%[b_ptr]]! 
\n\t" // B row0, B - // row1, q1 - // used - "vmov.s8 q2, #0 \n\t" // q2 used - "vdup.s8 d6, d0[0] \n\t" - "vdup.s8 d7, d1[0] \n\t" // q3 used - "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B - // row0 - "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B - // row1, q3 - // free - "vaddw.s16 q4, q4, d4 \n\t" - "vaddw.s16 q5, q5, d5 \n\t" // res row 0 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[1] \n\t" - "vdup.s8 d7, d1[1] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q6, q6, d4 \n\t" - "vaddw.s16 q7, q7, d5 \n\t" // res row 1 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[2] \n\t" - "vdup.s8 d7, d1[2] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q8, q8, d4 \n\t" - "vaddw.s16 q9, q9, d5 \n\t" // res row 2 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[3] \n\t" - "vdup.s8 d7, d1[3] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q10, q10, d4 \n\t" - "vaddw.s16 q11, q11, d5 \n\t" // res row 3 - "vmov.s8 q2, #0. \n\t" - "vdup.s8 d6, d0[4] \n\t" - "vdup.s8 d7, d1[4] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q12, q12, d4 \n\t" - "vaddw.s16 q13, q13, d5 \n\t" // res row 4 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[5] \n\t" - "vdup.s8 d7, d1[5] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q14, q14, d4 \n\t" - "vaddw.s16 q15, q15, d5 \n\t" // res row 5 - - "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 - "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 - // used - "vld1.s8 {d2-d3}, [%[b_ptr]]! 
\n\t" // B row0, B - // row1, q1 - // used - "vmov.s8 q2, #0 \n\t" // q2 used - "vdup.s8 d6, d0[0] \n\t" - "vdup.s8 d7, d1[0] \n\t" // q3 used - "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B - // row0 - "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B - // row1, q3 - // free - "vaddw.s16 q4, q4, d4 \n\t" - "vaddw.s16 q5, q5, d5 \n\t" // res row 0 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[1] \n\t" - "vdup.s8 d7, d1[1] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q6, q6, d4 \n\t" - "vaddw.s16 q7, q7, d5 \n\t" // res row 1 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[2] \n\t" - "vdup.s8 d7, d1[2] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q8, q8, d4 \n\t" - "vaddw.s16 q9, q9, d5 \n\t" // res row 2 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[3] \n\t" - "vdup.s8 d7, d1[3] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q10, q10, d4 \n\t" - "vaddw.s16 q11, q11, d5 \n\t" // res row 3 - "vmov.s8 q2, #0. \n\t" - "vdup.s8 d6, d0[4] \n\t" - "vdup.s8 d7, d1[4] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q12, q12, d4 \n\t" - "vaddw.s16 q13, q13, d5 \n\t" // res row 4 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[5] \n\t" - "vdup.s8 d7, d1[5] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q14, q14, d4 \n\t" - "vaddw.s16 q15, q15, d5 \n\t" // res row 5 - - "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 - "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 - // used - "vld1.s8 {d2-d3}, [%[b_ptr]]! 
\n\t" // B row0, B - // row1, q1 - // used - "vmov.s8 q2, #0 \n\t" // q2 used - "vdup.s8 d6, d0[0] \n\t" - "vdup.s8 d7, d1[0] \n\t" // q3 used - "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B - // row0 - "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B - // row1, q3 - // free - "vaddw.s16 q4, q4, d4 \n\t" - "vaddw.s16 q5, q5, d5 \n\t" // res row 0 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[1] \n\t" - "vdup.s8 d7, d1[1] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q6, q6, d4 \n\t" - "vaddw.s16 q7, q7, d5 \n\t" // res row 1 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[2] \n\t" - "vdup.s8 d7, d1[2] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q8, q8, d4 \n\t" - "vaddw.s16 q9, q9, d5 \n\t" // res row 2 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[3] \n\t" - "vdup.s8 d7, d1[3] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q10, q10, d4 \n\t" - "vaddw.s16 q11, q11, d5 \n\t" // res row 3 - "vmov.s8 q2, #0. \n\t" - "vdup.s8 d6, d0[4] \n\t" - "vdup.s8 d7, d1[4] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q12, q12, d4 \n\t" - "vaddw.s16 q13, q13, d5 \n\t" // res row 4 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[5] \n\t" - "vdup.s8 d7, d1[5] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q14, q14, d4 \n\t" - "vaddw.s16 q15, q15, d5 \n\t" // res row 5 - - "subs %[kc1], %[kc1], #1 \n\t" // last <8 rows - "bge 0b \n\t" - "1: \n\t" - "subs %[kc3], %[kc3], #1 \n\t" - "blt 3f \n\t" - "2: \n\t" - "vld1.s8 {d0}, [%[a_ptr]], r0 \n\t" // A col0 - "vld1.s8 {d1}, [%[a_ptr]], r0 \n\t" // A col1, q0 - // used - "vld1.s8 {d2-d3}, [%[b_ptr]]! 
\n\t" // B row0, B - // row1, q1 - // used - "vmov.s8 q2, #0 \n\t" // q2 used - "vdup.s8 d6, d0[0] \n\t" - "vdup.s8 d7, d1[0] \n\t" // q3 used - "vmlal.s8 q2, d2, d6 \n\t" // A col00 * B - // row0 - "vmlal.s8 q2, d3, d7 \n\t" // A col10 * B - // row1, q3 - // free - "vaddw.s16 q4, q4, d4 \n\t" - "vaddw.s16 q5, q5, d5 \n\t" // res row 0 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[1] \n\t" - "vdup.s8 d7, d1[1] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q6, q6, d4 \n\t" - "vaddw.s16 q7, q7, d5 \n\t" // res row 1 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[2] \n\t" - "vdup.s8 d7, d1[2] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q8, q8, d4 \n\t" - "vaddw.s16 q9, q9, d5 \n\t" // res row 2 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[3] \n\t" - "vdup.s8 d7, d1[3] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q10, q10, d4 \n\t" - "vaddw.s16 q11, q11, d5 \n\t" // res row 3 - "vmov.s8 q2, #0. \n\t" - "vdup.s8 d6, d0[4] \n\t" - "vdup.s8 d7, d1[4] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q12, q12, d4 \n\t" - "vaddw.s16 q13, q13, d5 \n\t" // res row 4 - "vmov.s8 q2, #0 \n\t" - "vdup.s8 d6, d0[5] \n\t" - "vdup.s8 d7, d1[5] \n\t" - "vmlal.s8 q2, d2, d6 \n\t" - "vmlal.s8 q2, d3, d7 \n\t" - "vaddw.s16 q14, q14, d4 \n\t" - "vaddw.s16 q15, q15, d5 \n\t" // res row 5 - - "subs %[kc3], %[kc3], #1 \n\t" - "bge 2b \n\t" - - "3: \n\t" // odd, last - // row - "subs %[kc4], %[kc4], #1 \n\t" - "blt 4f \n\t" - "vld1.s8 {d0}, [%[a_ptr]] \n\t" - "vld1.s8 {d1}, [%[b_ptr]] \n\t" - "vdup.s8 d2, d0[0] \n\t" - "vmull.s8 q2, d1, d2 \n\t" - "vaddw.s16 q4, q4, d4 \n\t" - "vaddw.s16 q5, q5, d5 \n\t" // res row 0 - "vdup.s8 d2, d0[1] \n\t" - "vmull.s8 q2, d1, d2 \n\t" - "vaddw.s16 q6, q6, d4 \n\t" - "vaddw.s16 q7, q7, d5 \n\t" // res row 1 - "vdup.s8 d2, d0[2] \n\t" - "vmull.s8 q2, d1, d2 \n\t" - "vaddw.s16 q8, q8, d4 \n\t" - "vaddw.s16 q9, q9, d5 \n\t" // res row 2 - "vdup.s8 
d2, d0[3] \n\t" - "vmull.s8 q2, d1, d2 \n\t" - "vaddw.s16 q10, q10, d4 \n\t" - "vaddw.s16 q11, q11, d5 \n\t" // res row 3 - "vdup.s8 d2, d0[4] \n\t" - "vmull.s8 q2, d1, d2 \n\t" - "vaddw.s16 q12, q12, d4 \n\t" - "vaddw.s16 q13, q13, d5 \n\t" // res row 4 - "vdup.s8 d2, d0[5] \n\t" - "vmull.s8 q2, d1, d2 \n\t" - "vaddw.s16 q14, q14, d4 \n\t" - "vaddw.s16 q15, q15, d5 \n\t" // res row 4 - "4: \n\t" - "vst1.32 {q4, q5}, [%[c]], %[step] \n\t" - "vst1.32 {q6, q7}, [%[c]], %[step] \n\t" - "vst1.32 {q8, q9}, [%[c]], %[step] \n\t" - "vst1.32 {q10, q11}, [%[c]], %[step] \n\t" - "vst1.32 {q12, q13}, [%[c]], %[step] \n\t" - "vst1.32 {q14, q15}, [%[c]] \n\t" + "pld [%[a_ptr]] \n\t" + "pld [%[b_ptr]] \n\t" + "pld [%[b_ptr], #64] \n\t" + "vmov.s8 q4, #0 \n\t" + "vmov.s8 q5, #0 \n\t" + "vmov.s8 q6, #0 \n\t" + "vmov.s8 q7, #0 \n\t" + "vmov.s8 q8, #0 \n\t" + "vmov.s8 q9, #0 \n\t" + "vmov.s8 q10, #0 \n\t" + "vmov.s8 q11, #0 \n\t" + "vmov.s8 q12, #0 \n\t" + "vmov.s8 q13, #0 \n\t" + "vmov.s8 q14, #0 \n\t" + "vmov.s8 q15, #0 \n\t" + "mov r0, #12 \n\t" + "subs %[kc1], %[kc1], #1 \n\t" + "blt 1f \n\t" + "0: \n\t" + "pld [%[a_ptr], #64] \n\t" + "pld [%[b_ptr], #128] \n\t" + "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols, q0 used, + // 1/2 q3 used + "vmov.s8 q2, #0 \n\t" // q2 used + "vld1.s8 {d6-d7}, [%[b_ptr]]! 
\n\t" // B 2 rows, B row1, + // q1 + "vdup.s8 d3, d0[0] \n\t" // q3 used // used + "vmlal.s8 q2, d6, d3 \n\t" // A col00 * B row0 + "vdup.s8 d3, d0[6] \n\t" // q3 used + "vmlal.s8 q2, d7, d3 \n\t" // A col10 * B row1, + // q3 free + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[1] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d0[7] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[2] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[0] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[3] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[1] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[4] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[2] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[5] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[3] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "vld1.s8 {d6-d7}, [%[b_ptr]]! 
\n\t" // B 2 rows, B row1, + // q1 + "vmov.s8 q2, #0 \n\t" // q2 used + "vdup.s8 d3, d1[4] \n\t" // q3 used // used + "vmlal.s8 q2, d6, d3 \n\t" // A col00 * B row0 + "vdup.s8 d3, d2[2] \n\t" // q3 used + "vmlal.s8 q2, d7, d3 \n\t" // A col10 * B row1, + // q3 free + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[5] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[3] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[6] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[4] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[7] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[5] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d2[0] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[6] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d2[1] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[7] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" // A 4 cols, q0 used, + // 1/2 q3 used + "vmov.s8 q2, #0 \n\t" // q2 used + "vld1.s8 {d6-d7}, [%[b_ptr]]! 
\n\t" // B 2 rows, B row1, + // q1 + "vdup.s8 d3, d0[0] \n\t" // q3 used // used + "vmlal.s8 q2, d6, d3 \n\t" // A col00 * B row0 + "vdup.s8 d3, d0[6] \n\t" // q3 used + "vmlal.s8 q2, d7, d3 \n\t" // A col10 * B row1, + // q3 free + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[1] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d0[7] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[2] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[0] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[3] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[1] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[4] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[2] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[5] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[3] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "vld1.s8 {d6-d7}, [%[b_ptr]]! 
\n\t" // B 2 rows, B row1, + // q1 + "vmov.s8 q2, #0 \n\t" // q2 used + "vdup.s8 d3, d1[4] \n\t" // q3 used // used + "vmlal.s8 q2, d6, d3 \n\t" // A col00 * B row0 + "vdup.s8 d3, d2[2] \n\t" // q3 used + "vmlal.s8 q2, d7, d3 \n\t" // A col10 * B row1, + // q3 free + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[5] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[3] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[6] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[4] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[7] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[5] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d2[0] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[6] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d2[1] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[7] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "subs %[kc1], %[kc1], #1 \n\t" + "bge 0b \n\t" + "1: \n\t" // last <8 rows + "subs %[kc3], %[kc3], #1 \n\t" + "blt 2f \n\t" + "vld1.s8 {d0-d2}, [%[a_ptr]]! \n\t" + "vmov.s8 q2, #0 \n\t" + "vld1.s8 {d6-d7}, [%[b_ptr]]! 
\n\t" + "vdup.s8 d3, d0[0] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d0[6] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[1] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d0[7] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[2] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[0] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[3] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[1] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[4] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[2] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d0[5] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d1[3] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "vld1.s8 {d6-d7}, [%[b_ptr]]! 
\n\t" + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[4] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[2] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[5] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[3] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[6] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[4] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d1[7] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[5] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d2[0] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[6] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d3, d2[1] \n\t" + "vmlal.s8 q2, d6, d3 \n\t" + "vdup.s8 d3, d2[7] \n\t" + "vmlal.s8 q2, d7, d3 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "2: \n\t" // last <4 rows + "subs %[kc5], %[kc5], #1 \n\t" + "blt 3f \n\t" + "vld1.s8 {d0, d1}, [%[a_ptr]], r0 \n\t" + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d6, d0[0] \n\t" + "vld1.s8 {d2-d3}, [%[b_ptr]]! 
\n\t" + "vdup.s8 d7, d0[6] \n\t" + "vmlal.s8 q2, d2, d6 \n\t" + "vmlal.s8 q2, d3, d7 \n\t" + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d6, d0[1] \n\t" + "vdup.s8 d7, d0[7] \n\t" + "vmlal.s8 q2, d2, d6 \n\t" + "vmlal.s8 q2, d3, d7 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d6, d0[2] \n\t" + "vdup.s8 d7, d1[0] \n\t" + "vmlal.s8 q2, d2, d6 \n\t" + "vmlal.s8 q2, d3, d7 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d6, d0[3] \n\t" + "vdup.s8 d7, d1[1] \n\t" + "vmlal.s8 q2, d2, d6 \n\t" + "vmlal.s8 q2, d3, d7 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vmov.s8 q2, #0. \n\t" + "vdup.s8 d6, d0[4] \n\t" + "vdup.s8 d7, d1[2] \n\t" + "vmlal.s8 q2, d2, d6 \n\t" + "vmlal.s8 q2, d3, d7 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, d5 \n\t" // res row 4 + "vmov.s8 q2, #0 \n\t" + "vdup.s8 d6, d0[5] \n\t" + "vdup.s8 d7, d1[3] \n\t" + "vmlal.s8 q2, d2, d6 \n\t" + "vmlal.s8 q2, d3, d7 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + + "3: \n\t" // last <2 rows + "subs %[kc6], %[kc6], #1 \n\t" + "blt 4f \n\t" + "vld1.s8 {d0}, [%[a_ptr]] \n\t" + "vld1.s8 {d1}, [%[b_ptr]] \n\t" + "vdup.s8 d2, d0[0] \n\t" + "vmull.s8 q2, d1, d2 \n\t" + "vaddw.s16 q4, q4, d4 \n\t" + "vaddw.s16 q5, q5, d5 \n\t" // res row 0 + "vdup.s8 d2, d0[1] \n\t" + "vmull.s8 q2, d1, d2 \n\t" + "vaddw.s16 q6, q6, d4 \n\t" + "vaddw.s16 q7, q7, d5 \n\t" // res row 1 + "vdup.s8 d2, d0[2] \n\t" + "vmull.s8 q2, d1, d2 \n\t" + "vaddw.s16 q8, q8, d4 \n\t" + "vaddw.s16 q9, q9, d5 \n\t" // res row 2 + "vdup.s8 d2, d0[3] \n\t" + "vmull.s8 q2, d1, d2 \n\t" + "vaddw.s16 q10, q10, d4 \n\t" + "vaddw.s16 q11, q11, d5 \n\t" // res row 3 + "vdup.s8 d2, d0[4] \n\t" + "vmull.s8 q2, d1, d2 \n\t" + "vaddw.s16 q12, q12, d4 \n\t" + "vaddw.s16 q13, q13, 
d5 \n\t" // res row 4 + "vdup.s8 d2, d0[5] \n\t" + "vmull.s8 q2, d1, d2 \n\t" + "vaddw.s16 q14, q14, d4 \n\t" + "vaddw.s16 q15, q15, d5 \n\t" // res row 5 + "4: \n\t" + "vst1.32 {q4, q5}, [%[c]], %[step] \n\t" + "vst1.32 {q6, q7}, [%[c]], %[step] \n\t" + "vst1.32 {q8, q9}, [%[c]], %[step] \n\t" + "vst1.32 {q10, q11}, [%[c]], %[step] \n\t" + "vst1.32 {q12, q13}, [%[c]], %[step] \n\t" + "vst1.32 {q14, q15}, [%[c]] \n\t" : : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1), - [kc3] "r"(kc3), [kc4] "r"(kc4), [step] "r"(step) + [kc3] "r"(kc3), [kc5] "r"(kc5), [kc6] "r"(kc6), [step] "r"(step) : "cc", "memory", "r0", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); #endif } -// 8 bit int inner product +// 8 bits int inner product void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, const int8_t *a, const int8_t *b, int8_t beta, int32_t *c, int32_t *C, int32_t ldc, bool relu, @@ -410,7 +475,7 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, } } -// 8 bit int PackMatrixA +// 8 bits int PackMatrixA void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer) { const int32_t i_length = m - m_tail; @@ -465,7 +530,7 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, } } -// 8 bit int PackMatrixB +// 8 bits int PackMatrixB void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer) { const int32_t j_length = n - n_tail; @@ -507,7 +572,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, } } -// 8 bit int matrix product (m*k x k*n) +// 8 bits int matrix product (m*k x k*n) void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C, int32_t ldc, bool relu, int8_t *bias) { @@ -570,7 +635,7 @@ void Gemm::Sgemm(int32_t m, 
int32_t n, int32_t k, int8_t alpha, const int8_t *A, paddle_mobile::memory::Free(zero_int8); } -// 8 bit int write back +// 8 bits int write back // C = alpha * A * B + beta * C void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc) {} diff --git a/src/operators/math/gpc.cpp b/src/operators/math/gpc.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6b7700081a2ab6cb11187fad898e944390217db3 --- /dev/null +++ b/src/operators/math/gpc.cpp @@ -0,0 +1,2142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef MULTICLASSNMS_OP + +#include "operators/math/gpc.h" + +namespace gpc { + +typedef struct lmt_shape { /* Local minima table */ + double y; /* Y coordinate at local minimum */ + edge_node *first_bound; /* Pointer to bound list */ + struct lmt_shape *next; /* Pointer to next local minimum */ +} lmt_node; + +typedef struct sbt_t_shape { /* Scanbeam tree */ + double y; /* Scanbeam node y value */ + struct sbt_t_shape *less; /* Pointer to nodes with lower y */ + struct sbt_t_shape *more; /* Pointer to nodes with higher y */ +} sb_tree; + +typedef struct it_shape { /* Intersection table */ + edge_node *ie[2]; /* Intersecting edge (bundle) pair */ + gpc_vertex point; /* Point of intersection */ + struct it_shape *next; /* The next intersection table node */ +} it_node; + +typedef struct st_shape { /* Sorted edge table */ + edge_node *edge; /* Pointer to AET edge */ + double xb; /* Scanbeam bottom x coordinate */ + double xt; /* Scanbeam top x coordinate */ + double dx; /* Change in x for a unit y increase */ + struct st_shape *prev; /* Previous edge in sorted list */ +} st_node; + +typedef struct bbox_shape { /* Contour axis-aligned bounding box */ + double xmin; /* Minimum x coordinate */ + double ymin; /* Minimum y coordinate */ + double xmax; /* Maximum x coordinate */ + double ymax; /* Maximum y coordinate */ +} bbox; + +/* +=========================================================================== + Global Data +=========================================================================== +*/ + +/* Horizontal edge state transitions within scanbeam boundary */ +const h_state next_h_state[3][6] = { + /* ABOVE BELOW CROSS */ + /* L R L R L R */ + /* NH */ + {BH, TH, TH, BH, NH, NH}, + /* BH */ + {NH, NH, NH, NH, TH, TH}, + /* TH */ + {NH, NH, NH, NH, BH, BH}}; + +/* +=========================================================================== + Private Functions +=========================================================================== +*/ + +static void 
reset_it(it_node **it) { + it_node *itn; + + while (*it) { + itn = (*it)->next; + gpc_free(*it); + *it = itn; + } +} + +static void reset_lmt(lmt_node **lmt) { + lmt_node *lmtn; + + while (*lmt) { + lmtn = (*lmt)->next; + gpc_free(*lmt); + *lmt = lmtn; + } +} + +static void insert_bound(edge_node **b, edge_node *e) { + edge_node *existing_bound = NULL; + + if (!*b) { + /* Link node e to the tail of the list */ + *b = e; + } else { + /* Do primary sort on the x field */ + if (e[0].bot.x < (*b)[0].bot.x) { + /* Insert a new node mid-list */ + existing_bound = *b; + *b = e; + (*b)->next_bound = existing_bound; + } else { + if (e[0].bot.x == (*b)[0].bot.x) { + /* Do secondary sort on the dx field */ + if (e[0].dx < (*b)[0].dx) { + /* Insert a new node mid-list */ + existing_bound = *b; + *b = e; + (*b)->next_bound = existing_bound; + } else { + /* Head further down the list */ + insert_bound(&((*b)->next_bound), e); + } + } else { + /* Head further down the list */ + insert_bound(&((*b)->next_bound), e); + } + } + } +} + +static edge_node **bound_list(lmt_node **lmt, double y) { + lmt_node *existing_node; + + if (!*lmt) { + /* Add node onto the tail end of the LMT */ + gpc_malloc(*lmt, sizeof(lmt_node), + const_cast("LMT insertion")); + (*lmt)->y = y; + (*lmt)->first_bound = NULL; + (*lmt)->next = NULL; + return &((*lmt)->first_bound); + } else if (y < (*lmt)->y) { + /* Insert a new LMT node before the current node */ + existing_node = *lmt; + gpc_malloc(*lmt, sizeof(lmt_node), + const_cast("LMT insertion")); + (*lmt)->y = y; + (*lmt)->first_bound = NULL; + (*lmt)->next = existing_node; + return &((*lmt)->first_bound); + } else { + if (y > (*lmt)->y) { + /* Head further up the LMT */ + return bound_list(&((*lmt)->next), y); + } else { + /* Use this existing LMT node */ + return &((*lmt)->first_bound); + } + } +} + +static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) { + if (!*sbtree) { + /* Add a new tree node here */ + gpc_malloc(*sbtree, 
sizeof(sb_tree), + const_cast("scanbeam tree insertion")); + (*sbtree)->y = y; + (*sbtree)->less = NULL; + (*sbtree)->more = NULL; + (*entries)++; + } else { + if ((*sbtree)->y > y) { + /* Head into the 'less' sub-tree */ + add_to_sbtree(entries, &((*sbtree)->less), y); + } else { + if ((*sbtree)->y < y) { + /* Head into the 'more' sub-tree */ + add_to_sbtree(entries, &((*sbtree)->more), y); + } + } + } +} + +static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) { + if (sbtree->less) { + build_sbt(entries, sbt, sbtree->less); + } + sbt[*entries] = sbtree->y; + (*entries)++; + if (sbtree->more) { + build_sbt(entries, sbt, sbtree->more); + } +} + +static void free_sbtree(sb_tree **sbtree) { + if (*sbtree) { + free_sbtree(&((*sbtree)->less)); + free_sbtree(&((*sbtree)->more)); + gpc_free(*sbtree); + } +} + +static int count_optimal_vertices(gpc_vertex_list c) { + int result = 0; + int i = 0; + + /* Ignore non-contributing contours */ + if (c.num_vertices > 0) { + for (i = 0; i < c.num_vertices; i++) { + /* Ignore superfluous vertices embedded in horizontal edges */ + if (gpc_optimal(c.vertex, i, c.num_vertices)) { + result++; + } + } + } + return result; +} + +static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries, + gpc_polygon *p, int type, gpc_op op) { + int c = 0; + int i = 0; + int min = 0; + int max = 0; + int num_edges = 0; + int v = 0; + int num_vertices = 0; + int total_vertices = 0; + int e_index = 0; + edge_node *e = NULL; + edge_node *edge_table = NULL; + + for (c = 0; c < p->num_contours; c++) { + total_vertices += count_optimal_vertices(p->contour[c]); + } + + /* Create the entire input polygon edge table in one go */ + gpc_malloc(edge_table, total_vertices * sizeof(edge_node), + const_cast("edge table creation")); + + for (c = 0; c < p->num_contours; c++) { + if (p->contour[c].num_vertices < 0) { + /* Ignore the non-contributing contour and repair the vertex count */ + p->contour[c].num_vertices = 
-p->contour[c].num_vertices; + } else { + /* Perform contour optimisation */ + num_vertices = 0; + for (i = 0; i < p->contour[c].num_vertices; i++) { + if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) { + edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x; + edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y; + + /* Record vertex in the scanbeam table */ + add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y); + + num_vertices++; + } + } + + /* Do the contour forward pass */ + for (min = 0; min < num_vertices; min++) { + /* If a forward local minimum... */ + if (gpc_fwd_min(edge_table, min, num_vertices)) { + /* Search for the next local maximum... */ + num_edges = 1; + max = gpc_next_index(min, num_vertices); + while (gpc_not_fmax(edge_table, max, num_vertices)) { + num_edges++; + max = gpc_next_index(max, num_vertices); + } + + /* Build the next edge list */ + e = &edge_table[e_index]; + e_index += num_edges; + v = min; + e[0].bstate[BELOW] = UNBUNDLED; + e[0].bundle[BELOW][CLIP] = 0; + e[0].bundle[BELOW][SUBJ] = 0; + for (i = 0; i < num_edges; i++) { + e[i].xb = edge_table[v].vertex.x; + e[i].bot.x = edge_table[v].vertex.x; + e[i].bot.y = edge_table[v].vertex.y; + + v = gpc_next_index(v, num_vertices); + + e[i].top.x = edge_table[v].vertex.x; + e[i].top.y = edge_table[v].vertex.y; + e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / + (e[i].top.y - e[i].bot.y); + e[i].type = type; + e[i].outp[ABOVE] = NULL; + e[i].outp[BELOW] = NULL; + e[i].next = NULL; + e[i].prev = NULL; + e[i].succ = + ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; + e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; + e[i].next_bound = NULL; + e[i].bside[CLIP] = (op == GPC_DIFF) ? 
RIGHT : LEFT; + e[i].bside[SUBJ] = LEFT; + } + insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); + } + } + + /* Do the contour reverse pass */ + for (min = 0; min < num_vertices; min++) { + /* If a reverse local minimum... */ + if (gpc_rev_min(edge_table, min, num_vertices)) { + /* Search for the previous local maximum... */ + num_edges = 1; + max = gpc_prev_index(min, num_vertices); + while (gpc_not_rmax(edge_table, max, num_vertices)) { + num_edges++; + max = gpc_prev_index(max, num_vertices); + } + + /* Build the previous edge list */ + e = &edge_table[e_index]; + e_index += num_edges; + v = min; + e[0].bstate[BELOW] = UNBUNDLED; + e[0].bundle[BELOW][CLIP] = 0; + e[0].bundle[BELOW][SUBJ] = 0; + for (i = 0; i < num_edges; i++) { + e[i].xb = edge_table[v].vertex.x; + e[i].bot.x = edge_table[v].vertex.x; + e[i].bot.y = edge_table[v].vertex.y; + + v = gpc_prev_index(v, num_vertices); + + e[i].top.x = edge_table[v].vertex.x; + e[i].top.y = edge_table[v].vertex.y; + e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / + (e[i].top.y - e[i].bot.y); + e[i].type = type; + e[i].outp[ABOVE] = NULL; + e[i].outp[BELOW] = NULL; + e[i].next = NULL; + e[i].prev = NULL; + e[i].succ = + ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; + e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; + e[i].next_bound = NULL; + e[i].bside[CLIP] = (op == GPC_DIFF) ? 
RIGHT : LEFT; + e[i].bside[SUBJ] = LEFT; + } + insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); + } + } + } + } + return edge_table; +} // NOLINT + +static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) { + if (!*aet) { + /* Append edge onto the tail end of the AET */ + *aet = edge; + edge->prev = prev; + edge->next = NULL; + } else { + /* Do primary sort on the xb field */ + if (edge->xb < (*aet)->xb) { + /* Insert edge here (before the AET edge) */ + edge->prev = prev; + edge->next = *aet; + (*aet)->prev = edge; + *aet = edge; + } else { + if (edge->xb == (*aet)->xb) { + /* Do secondary sort on the dx field */ + if (edge->dx < (*aet)->dx) { + /* Insert edge here (before the AET edge) */ + edge->prev = prev; + edge->next = *aet; + (*aet)->prev = edge; + *aet = edge; + } else { + /* Head further into the AET */ + add_edge_to_aet(&((*aet)->next), edge, *aet); + } + } else { + /* Head further into the AET */ + add_edge_to_aet(&((*aet)->next), edge, *aet); + } + } + } +} + +static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1, + double x, double y) { + it_node *existing_node; + + if (!*it) { + /* Append a new node to the tail of the list */ + gpc_malloc(*it, sizeof(it_node), + const_cast("IT insertion")); + (*it)->ie[0] = edge0; + (*it)->ie[1] = edge1; + (*it)->point.x = x; + (*it)->point.y = y; + (*it)->next = NULL; + } else { + if ((*it)->point.y > y) { + /* Insert a new node mid-list */ + existing_node = *it; + gpc_malloc(*it, sizeof(it_node), + const_cast("IT insertion")); + (*it)->ie[0] = edge0; + (*it)->ie[1] = edge1; + (*it)->point.x = x; + (*it)->point.y = y; + (*it)->next = existing_node; + } else { + /* Head further down the list */ + add_intersection(&((*it)->next), edge0, edge1, x, y); + } + } +} + +static void add_st_edge(st_node **st, it_node **it, edge_node *edge, + double dy) { + st_node *existing_node; + double den = 0.0; + double r = 0.0; + double x = 0.0; + double y = 0.0; + + if (!*st) { 
+ /* Append edge onto the tail end of the ST */ + gpc_malloc(*st, sizeof(st_node), + const_cast("ST insertion")); + (*st)->edge = edge; + (*st)->xb = edge->xb; + (*st)->xt = edge->xt; + (*st)->dx = edge->dx; + (*st)->prev = NULL; + } else { + den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb); + + /* If new edge and ST edge don't cross */ + if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) || + (fabs(den) <= DBL_EPSILON)) { + /* No intersection - insert edge here (before the ST edge) */ + existing_node = *st; + gpc_malloc(*st, sizeof(st_node), + const_cast("ST insertion")); + (*st)->edge = edge; + (*st)->xb = edge->xb; + (*st)->xt = edge->xt; + (*st)->dx = edge->dx; + (*st)->prev = existing_node; + } else { + /* Compute intersection between new edge and ST edge */ + r = (edge->xb - (*st)->xb) / den; + x = (*st)->xb + r * ((*st)->xt - (*st)->xb); + y = r * dy; + + /* Insert the edge pointers and the intersection point in the IT */ + add_intersection(it, (*st)->edge, edge, x, y); + + /* Head further into the ST */ + add_st_edge(&((*st)->prev), it, edge, dy); + } + } +} + +static void build_intersection_table(it_node **it, edge_node *aet, double dy) { + st_node *st; + st_node *stp; + edge_node *edge = NULL; + + /* Build intersection table for the current scanbeam */ + reset_it(it); + st = NULL; + + /* Process each AET edge */ + for (edge = aet; edge; edge = edge->next) { + if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] || + edge->bundle[ABOVE][SUBJ]) { + add_st_edge(&st, it, edge, dy); + } + } + + /* Free the sorted edge table */ + while (st) { + stp = st->prev; + gpc_free(st); + st = stp; + } +} + +static int count_contours(polygon_node *polygon) { + int nc = 0; + int nv = 0; + vertex_node *v = NULL; + vertex_node *nextv = NULL; + + for (nc = 0; polygon; polygon = polygon->next) { + if (polygon->active) { + /* Count the vertices in the current contour */ + nv = 0; + for (v = polygon->proxy->v[LEFT]; v; v = v->next) { + nv++; + } + + /* 
Record valid vertex counts in the active field */ + if (nv > 2) { + polygon->active = nv; + nc++; + } else { + /* Invalid contour: just free the heap */ + for (v = polygon->proxy->v[LEFT]; v; v = nextv) { + nextv = v->next; + gpc_free(v); + } + polygon->active = 0; + } + } + } + return nc; +} + +static void add_left(polygon_node *p, double x, double y) { + vertex_node *nv = NULL; + + /* Create a new vertex node and set its fields */ + gpc_malloc(nv, sizeof(vertex_node), + const_cast("vertex node creation")); + nv->x = x; + nv->y = y; + + /* Add vertex nv to the left end of the polygon's vertex list */ + nv->next = p->proxy->v[LEFT]; + + /* Update proxy->v[LEFT] to point to nv */ + p->proxy->v[LEFT] = nv; +} + +static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) { + polygon_node *target = NULL; + + /* Label contour as a hole */ + q->proxy->hole = 1; + + if (p->proxy != q->proxy) { + /* Assign p's vertex list to the left end of q's list */ + p->proxy->v[RIGHT]->next = q->proxy->v[LEFT]; + q->proxy->v[LEFT] = p->proxy->v[LEFT]; + + /* Redirect any p->proxy references to q->proxy */ + + for (target = p->proxy; list; list = list->next) { + if (list->proxy == target) { + list->active = 0; + list->proxy = q->proxy; + } + } + } +} + +static void add_right(polygon_node *p, double x, double y) { + vertex_node *nv = NULL; + + /* Create a new vertex node and set its fields */ + gpc_malloc(nv, sizeof(vertex_node), + const_cast("vertex node creation")); + nv->x = x; + nv->y = y; + nv->next = NULL; + + /* Add vertex nv to the right end of the polygon's vertex list */ + p->proxy->v[RIGHT]->next = nv; + + /* Update proxy->v[RIGHT] to point to nv */ + p->proxy->v[RIGHT] = nv; +} + +static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { + polygon_node *target = NULL; + + /* Label contour as external */ + q->proxy->hole = 0; + + if (p->proxy != q->proxy) { + /* Assign p's vertex list to the right end of q's list */ + 
q->proxy->v[RIGHT]->next = p->proxy->v[LEFT]; + q->proxy->v[RIGHT] = p->proxy->v[RIGHT]; + + /* Redirect any p->proxy references to q->proxy */ + for (target = p->proxy; list; list = list->next) { + if (list->proxy == target) { + list->active = 0; + list->proxy = q->proxy; + } + } + } +} + +static void add_local_min(polygon_node **p, edge_node *edge, double x, + double y) { + polygon_node *existing_min = NULL; + vertex_node *nv = NULL; + + existing_min = *p; + + gpc_malloc(*p, sizeof(polygon_node), + const_cast("polygon node creation")); + + /* Create a new vertex node and set its fields */ + gpc_malloc(nv, sizeof(vertex_node), + const_cast("vertex node creation")); + nv->x = x; + nv->y = y; + nv->next = NULL; + + /* Initialise proxy to point to p itself */ + (*p)->proxy = (*p); + (*p)->active = 1; + (*p)->next = existing_min; + + /* Make v[LEFT] and v[RIGHT] point to new vertex nv */ + (*p)->v[LEFT] = nv; + (*p)->v[RIGHT] = nv; + + /* Assign polygon p to the edge */ + edge->outp[ABOVE] = *p; +} + +static int count_tristrips(polygon_node *tn) { + int total = 0; + + for (total = 0; tn; tn = tn->next) { + if (tn->active > 2) { + total++; + } + } + return total; +} + +void add_vertex(vertex_node **t, double x, double y) { + if (!(*t)) { + gpc_malloc(*t, sizeof(vertex_node), + const_cast("tristrip vertex creation")); + (*t)->x = x; + (*t)->y = y; + (*t)->next = NULL; + } else { + /* Head further down the list */ + add_vertex(&((*t)->next), x, y); + } +} + +void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { + add_vertex(&(e->outp[p]->v[s]), x, y); + e->outp[p]->active++; +} + +static void new_tristrip(polygon_node **tn, edge_node *edge, double x, + double y) { + if (!(*tn)) { + gpc_malloc(*tn, sizeof(polygon_node), + const_cast("tristrip node creation")); + (*tn)->next = NULL; + (*tn)->v[LEFT] = NULL; + (*tn)->v[RIGHT] = NULL; + (*tn)->active = 1; + add_vertex(&((*tn)->v[LEFT]), x, y); + edge->outp[ABOVE] = *tn; + } else { + /* Head further down 
the list */ + new_tristrip(&((*tn)->next), edge, x, y); + } +} + +static bbox *create_contour_bboxes(gpc_polygon *p) { + bbox *box; + int c = 0; + int v = 0; + + gpc_malloc(box, p->num_contours * sizeof(bbox), + const_cast("Bounding box creation")); + + /* Construct contour bounding boxes */ + for (c = 0; c < p->num_contours; c++) { + /* Initialise bounding box extent */ + box[c].xmin = DBL_MAX; + box[c].ymin = DBL_MAX; + box[c].xmax = -DBL_MAX; + box[c].ymax = -DBL_MAX; + + for (v = 0; v < p->contour[c].num_vertices; v++) { + /* Adjust bounding box */ + if (p->contour[c].vertex[v].x < box[c].xmin) { + box[c].xmin = p->contour[c].vertex[v].x; + } + if (p->contour[c].vertex[v].y < box[c].ymin) { + box[c].ymin = p->contour[c].vertex[v].y; + } + if (p->contour[c].vertex[v].x > box[c].xmax) { + box[c].xmax = p->contour[c].vertex[v].x; + } + if (p->contour[c].vertex[v].y > box[c].ymax) { + box[c].ymax = p->contour[c].vertex[v].y; + } + } + } + return box; +} + +static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) { + bbox *s_bbox; + bbox *c_bbox; + int s = 0; + int c = 0; + int *o_table = NULL; + int overlap = 0; + + s_bbox = create_contour_bboxes(subj); + c_bbox = create_contour_bboxes(clip); + + gpc_malloc(o_table, + subj->num_contours * clip->num_contours * sizeof(int), + const_cast("overlap table creation")); + + /* Check all subject contour bounding boxes against clip boxes */ + for (s = 0; s < subj->num_contours; s++) { + for (c = 0; c < clip->num_contours; c++) { + o_table[c * subj->num_contours + s] = + (!((s_bbox[s].xmax < c_bbox[c].xmin) || + (s_bbox[s].xmin > c_bbox[c].xmax))) && + (!((s_bbox[s].ymax < c_bbox[c].ymin) || + (s_bbox[s].ymin > c_bbox[c].ymax))); + } + } + + /* For each clip contour, search for any subject contour overlaps */ + for (c = 0; c < clip->num_contours; c++) { + overlap = 0; + for (s = 0; (!overlap) && (s < subj->num_contours); s++) { + overlap = o_table[c * subj->num_contours + s]; + } + + if (!overlap) { + /* Flag 
non contributing status by negating vertex count */ + clip->contour[c].num_vertices = -clip->contour[c].num_vertices; + } + } + + if (op == GPC_INT) { + /* For each subject contour, search for any clip contour overlaps */ + for (s = 0; s < subj->num_contours; s++) { + overlap = 0; + for (c = 0; (!overlap) && (c < clip->num_contours); c++) { + overlap = o_table[c * subj->num_contours + s]; + } + + if (!overlap) { + /* Flag non contributing status by negating vertex count */ + subj->contour[s].num_vertices = -subj->contour[s].num_vertices; + } + } + } + + gpc_free(s_bbox); + gpc_free(c_bbox); + gpc_free(o_table); +} + +/* +=========================================================================== + Public Functions +=========================================================================== +*/ + +void gpc_free_polygon(gpc_polygon *p) { + int c = 0; + + for (c = 0; c < p->num_contours; c++) { + gpc_free(p->contour[c].vertex); + } + gpc_free(p->hole); + gpc_free(p->contour); + p->num_contours = 0; +} + +void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { + int *extended_hole = NULL; + int c = 0; + int v = 0; + gpc_vertex_list *extended_contour = NULL; + + /* Create an extended hole array */ + gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), + const_cast("contour hole addition")); + + /* Create an extended contour array */ + gpc_malloc(extended_contour, + (p->num_contours + 1) * sizeof(gpc_vertex_list), + const_cast("contour addition")); + + /* Copy the old contour and hole data into the extended arrays */ + for (c = 0; c < p->num_contours; c++) { + extended_hole[c] = p->hole[c]; + extended_contour[c] = p->contour[c]; + } + + /* Copy the new contour and hole onto the end of the extended arrays */ + c = p->num_contours; + extended_hole[c] = hole; + extended_contour[c].num_vertices = new_contour->num_vertices; + gpc_malloc(extended_contour[c].vertex, + new_contour->num_vertices * sizeof(gpc_vertex), + const_cast("contour 
addition")); + for (v = 0; v < new_contour->num_vertices; v++) { + extended_contour[c].vertex[v] = new_contour->vertex[v]; + } + + /* Dispose of the old contour */ + gpc_free(p->contour); + gpc_free(p->hole); + + /* Update the polygon information */ + p->num_contours++; + p->hole = extended_hole; + p->contour = extended_contour; +} + +// gpc_polygon_clip +void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, + gpc_polygon *result) { + sb_tree *sbtree = NULL; + it_node *it = NULL; + it_node *intersect = NULL; + edge_node *edge = NULL; + edge_node *prev_edge = NULL; + edge_node *next_edge = NULL; + edge_node *succ_edge = NULL; + edge_node *e0 = NULL; + edge_node *e1 = NULL; + edge_node *aet = NULL; + edge_node *c_heap = NULL; + edge_node *s_heap = NULL; + lmt_node *lmt = NULL; + lmt_node *local_min = NULL; + polygon_node *out_poly = NULL; + polygon_node *p = NULL; + polygon_node *q = NULL; + polygon_node *poly = NULL; + polygon_node *npoly = NULL; + polygon_node *cf = NULL; + vertex_node *vtx = NULL; + vertex_node *nv = NULL; + h_state horiz[2]; + int in[2]; + int exists[2]; + int parity[2] = {LEFT, LEFT}; + int c = 0; + int v = 0; + int contributing = 0; + int search = 0; + int scanbeam = 0; + int sbt_entries = 0; + int vclass = 0; + int bl = 0; + int br = 0; + int tl = 0; + int tr = 0; + double *sbt = NULL; + double xb = 0.0; + double px = 0.0; + double yb = 0.0; + double yt = 0.0; + double dy = 0.0; + double ix = 0.0; + double iy = 0.0; + + /* Test for trivial NULL result cases */ + if (((subj->num_contours == 0) && (clip->num_contours == 0)) || + ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || + ((clip->num_contours == 0) && (op == GPC_INT))) { + result->num_contours = 0; + result->hole = NULL; + result->contour = NULL; + return; + } + /* Identify potentially contributing contours */ + if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && + (clip->num_contours > 0)) { + minimax_test(subj, clip, op); + } + 
/* Build LMT */ + if (subj->num_contours > 0) { + s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); + } + if (clip->num_contours > 0) { + c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); + } + /* Return a NULL result if no contours contribute */ + if (lmt == NULL) { + result->num_contours = 0; + result->hole = NULL; + result->contour = NULL; + reset_lmt(&lmt); + gpc_free(s_heap); + gpc_free(c_heap); + return; + } + + /* Build scanbeam table from scanbeam tree */ + gpc_malloc(sbt, sbt_entries * sizeof(double), + const_cast("sbt creation")); + build_sbt(&scanbeam, sbt, sbtree); + scanbeam = 0; + free_sbtree(&sbtree); + /* Allow pointer re-use without causing memory leak */ + if (subj == result) { + gpc_free_polygon(subj); + } + if (clip == result) { + gpc_free_polygon(clip); + } + /* Invert clip polygon for difference operation */ + if (op == GPC_DIFF) { + parity[CLIP] = RIGHT; + } + local_min = lmt; + + // Process each scanbeam + while (scanbeam < sbt_entries) { + /* Set yb and yt to the bottom and top of the scanbeam */ + yb = sbt[scanbeam++]; + if (scanbeam < sbt_entries) { + yt = sbt[scanbeam]; + dy = yt - yb; + } + /* === SCANBEAM BOUNDARY PROCESSING ================================ */ + /* If LMT node corresponding to yb exists */ + if (local_min) { + if (local_min->y == yb) { + /* Add edges starting at this local minimum to the AET */ + for (edge = local_min->first_bound; edge; edge = edge->next_bound) { + add_edge_to_aet(&aet, edge, NULL); + } + local_min = local_min->next; + } + } + /* Set dummy previous x value */ + px = -DBL_MAX; + /* Create bundles within AET */ + e0 = aet; + e1 = aet; + /* Set up bundle fields of first edge */ + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); + aet->bundle[ABOVE][!aet->type] = 0; + aet->bstate[ABOVE] = UNBUNDLED; + + for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { + /* Set up bundle fields of next edge */ + next_edge->bundle[ABOVE][next_edge->type] = 
(next_edge->top.y != yb); + next_edge->bundle[ABOVE][!next_edge->type] = 0; + next_edge->bstate[ABOVE] = UNBUNDLED; + /* Bundle edges above the scanbeam boundary if they coincide */ + if (next_edge->bundle[ABOVE][next_edge->type]) { + if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && + (e0->top.y != yb)) { + next_edge->bundle[ABOVE][next_edge->type] ^= + e0->bundle[ABOVE][next_edge->type]; + next_edge->bundle[ABOVE][!next_edge->type] = + e0->bundle[ABOVE][!next_edge->type]; + next_edge->bstate[ABOVE] = BUNDLE_HEAD; + e0->bundle[ABOVE][CLIP] = 0; + e0->bundle[ABOVE][SUBJ] = 0; + e0->bstate[ABOVE] = BUNDLE_TAIL; + } + e0 = next_edge; + } + } + horiz[CLIP] = NH; + horiz[SUBJ] = NH; + + // Process each edge at this scanbeam boundary + for (edge = aet; edge; edge = edge->next) { + exists[CLIP] = + edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); + exists[SUBJ] = + edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); + if (exists[CLIP] || exists[SUBJ]) { + /* Set bundle side */ + edge->bside[CLIP] = parity[CLIP]; + edge->bside[SUBJ] = parity[SUBJ]; + /* Determine contributing status and quadrant occupancies */ + switch (op) { + case GPC_DIFF: + case GPC_INT: + contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) && (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_XOR: + contributing = exists[CLIP] || exists[SUBJ]; + br = (parity[CLIP]) ^ (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = 
(parity[CLIP] ^ (horiz[CLIP] != NH)) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_UNION: + contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) || (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + } + // Update parity + parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; + parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; + /* Update horizontal state */ + if (exists[CLIP]) { + horiz[CLIP] = next_h_state[horiz[CLIP]] + [((exists[CLIP] - 1) << 1) + parity[CLIP]]; + } + if (exists[SUBJ]) { + horiz[SUBJ] = next_h_state[horiz[SUBJ]] + [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + if (contributing) { + xb = edge->xb; + switch (vclass) { + case EMN: + case IMN: + add_local_min(&out_poly, edge, xb, yb); + px = xb; + cf = edge->outp[ABOVE]; + break; + case ERI: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + edge->outp[ABOVE] = cf; + cf = NULL; + break; + case ELI: + add_left(edge->outp[BELOW], xb, yb); + px = xb; + cf = edge->outp[BELOW]; + break; + case EMX: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + merge_right(cf, edge->outp[BELOW], out_poly); + cf = NULL; + break; + case ILI: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + edge->outp[ABOVE] = cf; + cf = NULL; + break; + case IRI: + add_right(edge->outp[BELOW], xb, yb); + px = xb; + cf = edge->outp[BELOW]; + 
edge->outp[BELOW] = NULL; + break; + case IMX: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + merge_left(cf, edge->outp[BELOW], out_poly); + cf = NULL; + edge->outp[BELOW] = NULL; + break; + case IMM: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + merge_left(cf, edge->outp[BELOW], out_poly); + edge->outp[BELOW] = NULL; + add_local_min(&out_poly, edge, xb, yb); + cf = edge->outp[ABOVE]; + break; + case EMM: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + merge_right(cf, edge->outp[BELOW], out_poly); + edge->outp[BELOW] = NULL; + add_local_min(&out_poly, edge, xb, yb); + cf = edge->outp[ABOVE]; + break; + case LED: + if (edge->bot.y == yb) { + add_left(edge->outp[BELOW], xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + px = xb; + break; + case RED: + if (edge->bot.y == yb) { + add_right(edge->outp[BELOW], xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + px = xb; + break; + default: + break; + } /* End of switch */ + } /* End of contributing conditional */ + } /* End of edge exists conditional */ + } // End of AET loop + + /* Delete terminating edges from the AET, otherwise compute xt */ + for (edge = aet; edge; edge = edge->next) { + if (edge->top.y == yb) { + prev_edge = edge->prev; + next_edge = edge->next; + if (prev_edge) { + prev_edge->next = next_edge; + } else { + aet = next_edge; + } + if (next_edge) { + next_edge->prev = prev_edge; + } + /* Copy bundle head state to the adjacent tail edge if required */ + if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { + if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->outp[BELOW] = edge->outp[BELOW]; + prev_edge->bstate[BELOW] = UNBUNDLED; + if (prev_edge->prev) { + if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->bstate[BELOW] = BUNDLE_HEAD; + } + } + } + } + } else { + if (edge->top.y == yt) { + edge->xt = edge->top.x; + } else { + edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); + } + } + } + + if (scanbeam < sbt_entries) { + 
/* === SCANBEAM INTERIOR PROCESSING ============================== */ + build_intersection_table(&it, aet, dy); + /* Process each node in the intersection table */ + for (intersect = it; intersect; intersect = intersect->next) { + e0 = intersect->ie[0]; + e1 = intersect->ie[1]; + /* Only generate output for contributing intersections */ + if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && + (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { + p = e0->outp[ABOVE]; + q = e1->outp[ABOVE]; + ix = intersect->point.x; + iy = intersect->point.y + yb; + + in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || + (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || + (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] && + e0->bside[CLIP] && e1->bside[CLIP]); + in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || + (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || + (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && + e0->bside[SUBJ] && e1->bside[SUBJ]); + + // Determine quadrant occupancies + switch (op) { + case GPC_DIFF: + case GPC_INT: + tr = (in[CLIP]) && (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_XOR: + tr = (in[CLIP]) ^ (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_UNION: + tr = (in[CLIP]) || (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] 
^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + switch (vclass) { + case EMN: + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + break; + case ERI: + if (p) { + add_right(p, ix, iy); + e1->outp[ABOVE] = p; + e0->outp[ABOVE] = NULL; + } + break; + case ELI: + if (q) { + add_left(q, ix, iy); + e0->outp[ABOVE] = q; + e1->outp[ABOVE] = NULL; + } + break; + case EMX: + if (p && q) { + add_left(p, ix, iy); + merge_right(p, q, out_poly); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMN: + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + break; + case ILI: + if (p) { + add_left(p, ix, iy); + e1->outp[ABOVE] = p; + e0->outp[ABOVE] = NULL; + } + break; + case IRI: + if (q) { + add_right(q, ix, iy); + e0->outp[ABOVE] = q; + e1->outp[ABOVE] = NULL; + } + break; + case IMX: + if (p && q) { + add_right(p, ix, iy); + merge_left(p, q, out_poly); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMM: + if (p && q) { + add_right(p, ix, iy); + merge_left(p, q, out_poly); + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + } + break; + case EMM: + if (p && q) { + add_left(p, ix, iy); + merge_right(p, q, out_poly); + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + } + break; + default: + break; + } // End of switch + } /* End of contributing intersection conditional */ + + /* Swap bundle sides in response to edge crossing */ + if (e0->bundle[ABOVE][CLIP]) { + e1->bside[CLIP] = !e1->bside[CLIP]; + } + if (e1->bundle[ABOVE][CLIP]) { + e0->bside[CLIP] = !e0->bside[CLIP]; + } + if (e0->bundle[ABOVE][SUBJ]) { + e1->bside[SUBJ] = !e1->bside[SUBJ]; + } + if (e1->bundle[ABOVE][SUBJ]) { + e0->bside[SUBJ] = !e0->bside[SUBJ]; + } + + /* Swap e0 and e1 
bundles in the AET */ + prev_edge = e0->prev; + next_edge = e1->next; + if (next_edge) { + next_edge->prev = e0; + } + if (e0->bstate[ABOVE] == BUNDLE_HEAD) { + search = 1; + while (search) { + prev_edge = prev_edge->prev; + if (prev_edge) { + if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) { + search = 0; + } + } else { + search = 0; + } + } + } + if (!prev_edge) { + aet->prev = e1; + e1->next = aet; + aet = e0->next; + } else { + prev_edge->next->prev = e1; + e1->next = prev_edge->next; + prev_edge->next = e0->next; + } + e0->next->prev = prev_edge; + e1->next->prev = e1; + e0->next = next_edge; + } /* End of IT loop*/ + + // Prepare for next scanbeam + for (edge = aet; edge; edge = next_edge) { + next_edge = edge->next; + succ_edge = edge->succ; + if ((edge->top.y == yt) && succ_edge) { + /* Replace AET edge by its successor */ + succ_edge->outp[BELOW] = edge->outp[ABOVE]; + succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; + succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + prev_edge = edge->prev; + if (prev_edge) { + prev_edge->next = succ_edge; + } else { + aet = succ_edge; + } + if (next_edge) { + next_edge->prev = succ_edge; + } + succ_edge->prev = prev_edge; + succ_edge->next = next_edge; + } else { + /* Update this edge */ + edge->outp[BELOW] = edge->outp[ABOVE]; + edge->bstate[BELOW] = edge->bstate[ABOVE]; + edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + edge->xb = edge->xt; + } + edge->outp[ABOVE] = NULL; + } + } + } /* === END OF SCANBEAM PROCESSING ================================== */ + // Generate result polygon from out_poly + result->contour = NULL; + result->hole = NULL; + result->num_contours = count_contours(out_poly); + if (result->num_contours > 0) { + gpc_malloc(result->hole, result->num_contours * sizeof(int), + const_cast("hole flag table creation")); + gpc_malloc(result->contour, + result->num_contours * 
sizeof(gpc_vertex_list), + const_cast("contour creation")); + + c = 0; + for (poly = out_poly; poly; poly = npoly) { + npoly = poly->next; + if (poly->active) { + result->hole[c] = poly->proxy->hole; + result->contour[c].num_vertices = poly->active; + gpc_malloc( + result->contour[c].vertex, + result->contour[c].num_vertices * sizeof(gpc_vertex), + const_cast("vertex creation")); + + v = result->contour[c].num_vertices - 1; + for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) { + nv = vtx->next; + result->contour[c].vertex[v].x = vtx->x; + result->contour[c].vertex[v].y = vtx->y; + gpc_free(vtx); + v--; + } + c++; + } + gpc_free(poly); + } + } else { + for (poly = out_poly; poly; poly = npoly) { + npoly = poly->next; + gpc_free(poly); + } + } + + // Tidy up + reset_it(&it); + reset_lmt(&lmt); + gpc_free(c_heap); + gpc_free(s_heap); + gpc_free(sbt); +} // NOLINT + +void gpc_free_tristrip(gpc_tristrip *t) { + int s = 0; + for (s = 0; s < t->num_strips; s++) { + gpc_free(t->strip[s].vertex); + } + gpc_free(t->strip); + t->num_strips = 0; +} + +void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) { + gpc_polygon c; + c.num_contours = 0; + c.hole = NULL; + c.contour = NULL; + gpc_tristrip_clip(GPC_DIFF, s, &c, t); +} + +// gpc_tristrip_clip +void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, + gpc_tristrip *result) { + sb_tree *sbtree = NULL; + it_node *it = NULL; + it_node *intersect = NULL; + edge_node *edge = NULL; + edge_node *prev_edge = NULL; + edge_node *next_edge = NULL; + edge_node *succ_edge = NULL; + edge_node *e0 = NULL; + edge_node *e1 = NULL; + edge_node *aet = NULL; + edge_node *c_heap = NULL; + edge_node *s_heap = NULL; + edge_node *cf = NULL; + lmt_node *lmt = NULL; + lmt_node *local_min = NULL; + polygon_node *tlist = NULL; + polygon_node *tn = NULL; + polygon_node *tnn = NULL; + polygon_node *p = NULL; + polygon_node *q = NULL; + vertex_node *lt = NULL; + vertex_node *ltn = NULL; + vertex_node *rt = NULL; + vertex_node *rtn 
= NULL; + h_state horiz[2]; + vertex_type cft = NUL; + int in[2]; + int exists[2]; + int parity[2] = {LEFT, LEFT}; + int s = 0; + int v = 0; + int contributing = 0; + int search = 0; + int scanbeam = 0; + int sbt_entries = 0; + int vclass = 0; + int bl = 0; + int br = 0; + int tl = 0; + int tr = 0; + double *sbt = NULL; + double xb = 0.0; + double px = 0.0; + double nx = 0.0; + double yb = 0.0; + double yt = 0.0; + double dy = 0.0; + double ix = 0.0; + double iy = 0.0; + + /* Test for trivial NULL result cases */ + if (((subj->num_contours == 0) && (clip->num_contours == 0)) || + ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || + ((clip->num_contours == 0) && (op == GPC_INT))) { + result->num_strips = 0; + result->strip = NULL; + return; + } + + /* Identify potentialy contributing contours */ + if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && + (clip->num_contours > 0)) { + minimax_test(subj, clip, op); + } + /* Build LMT */ + if (subj->num_contours > 0) { + s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); + } + if (clip->num_contours > 0) { + c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); + } + /* Return a NULL result if no contours contribute */ + if (lmt == NULL) { + result->num_strips = 0; + result->strip = NULL; + reset_lmt(&lmt); + gpc_free(s_heap); + gpc_free(c_heap); + return; + } + + /* Build scanbeam table from scanbeam tree */ + gpc_malloc(sbt, sbt_entries * sizeof(double), + const_cast("sbt creation")); + build_sbt(&scanbeam, sbt, sbtree); + scanbeam = 0; + free_sbtree(&sbtree); + + /* Invert clip polygon for difference operation */ + if (op == GPC_DIFF) { + parity[CLIP] = RIGHT; + } + local_min = lmt; + + // Process each scanbeam + while (scanbeam < sbt_entries) { + /* Set yb and yt to the bottom and top of the scanbeam */ + yb = sbt[scanbeam++]; + if (scanbeam < sbt_entries) { + yt = sbt[scanbeam]; + dy = yt - yb; + } + + /* === SCANBEAM BOUNDARY PROCESSING 
================================ */ + /* If LMT node corresponding to yb exists */ + if (local_min) { + if (local_min->y == yb) { + /* Add edges starting at this local minimum to the AET */ + for (edge = local_min->first_bound; edge; edge = edge->next_bound) { + add_edge_to_aet(&aet, edge, NULL); + } + local_min = local_min->next; + } + } + /* Set dummy previous x value */ + /* Create bundles within AET */ + px = -DBL_MAX; + e0 = aet; + e1 = aet; + + /* Set up bundle fields of first edge */ + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); + aet->bundle[ABOVE][!aet->type] = 0; + aet->bstate[ABOVE] = UNBUNDLED; + + for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { + /* Set up bundle fields of next edge */ + next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb); + next_edge->bundle[ABOVE][!next_edge->type] = 0; + next_edge->bstate[ABOVE] = UNBUNDLED; + + /* Bundle edges above the scanbeam boundary if they coincide */ + if (next_edge->bundle[ABOVE][next_edge->type]) { + if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && + (e0->top.y != yb)) { + next_edge->bundle[ABOVE][next_edge->type] ^= + e0->bundle[ABOVE][next_edge->type]; + next_edge->bundle[ABOVE][!next_edge->type] = + e0->bundle[ABOVE][!next_edge->type]; + next_edge->bstate[ABOVE] = BUNDLE_HEAD; + e0->bundle[ABOVE][CLIP] = 0; + e0->bundle[ABOVE][SUBJ] = 0; + e0->bstate[ABOVE] = BUNDLE_TAIL; + } + e0 = next_edge; + } + } + horiz[CLIP] = NH; + horiz[SUBJ] = NH; + + /* Process each edge at this scanbeam boundary */ + for (edge = aet; edge; edge = edge->next) { + exists[CLIP] = + edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); + exists[SUBJ] = + edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); + + if (exists[CLIP] || exists[SUBJ]) { + /* Set bundle side */ + edge->bside[CLIP] = parity[CLIP]; + edge->bside[SUBJ] = parity[SUBJ]; + + /* Determine contributing status and quadrant occupancies */ + switch (op) { + case GPC_DIFF: + case 
GPC_INT: + contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) && (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_XOR: + contributing = exists[CLIP] || exists[SUBJ]; + br = (parity[CLIP]) ^ (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_UNION: + contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) || (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + } + + // Update parity + parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; + parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; + + /* Update horizontal state */ + if (exists[CLIP]) { + horiz[CLIP] = next_h_state[horiz[CLIP]] + [((exists[CLIP] - 1) << 1) + parity[CLIP]]; + } + if (exists[SUBJ]) { + horiz[SUBJ] = next_h_state[horiz[SUBJ]] + [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; + } + 
vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + + if (contributing) { + xb = edge->xb; + switch (vclass) { + case EMN: + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + break; + case ERI: + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (xb != cf->xb) { + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + } + cf = NULL; + break; + case ELI: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = NULL; + cf = edge; + break; + case EMX: + if (xb != cf->xb) { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } + edge->outp[ABOVE] = NULL; + cf = NULL; + break; + case IMN: + if (cft == LED) { + if (cf->bot.y != yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + } + new_tristrip(&tlist, cf, cf->xb, yb); + } + edge->outp[ABOVE] = cf->outp[ABOVE]; + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + break; + case ILI: + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + cft = ILI; + break; + case IRI: + if (cft == LED) { + if (cf->bot.y != yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + } + new_tristrip(&tlist, cf, cf->xb, yb); + } + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + edge->outp[ABOVE] = NULL; + break; + case IMX: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = NULL; + cft = IMX; + break; + case IMM: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (xb != cf->xb) { + gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb); + } + cf = edge; + break; + case EMM: + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + edge->outp[ABOVE] = NULL; + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + break; + case LED: + if (edge->bot.y == yb) { + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + cf = edge; + cft = LED; + break; + case RED: + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (cft == LED) { + if (cf->bot.y == yb) { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } else { + if (edge->bot.y == yb) { + gpc_vertex_create(cf, BELOW, LEFT, 
cf->xb, yb); + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } + } + } else { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + } + cf = NULL; + break; + default: + break; + } /* End of switch */ + } /* End of contributing conditional */ + } /* End of edge exists conditional */ + } // End of AET loop + + /* Delete terminating edges from the AET, otherwise compute xt */ + for (edge = aet; edge; edge = edge->next) { + if (edge->top.y == yb) { + prev_edge = edge->prev; + next_edge = edge->next; + if (prev_edge) { + prev_edge->next = next_edge; + } else { + aet = next_edge; + } + if (next_edge) { + next_edge->prev = prev_edge; + } + + /* Copy bundle head state to the adjacent tail edge if required */ + if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { + if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->outp[BELOW] = edge->outp[BELOW]; + prev_edge->bstate[BELOW] = UNBUNDLED; + if (prev_edge->prev) { + if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->bstate[BELOW] = BUNDLE_HEAD; + } + } + } + } + } else { + if (edge->top.y == yt) { + edge->xt = edge->top.x; + } else { + edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); + } + } + } + + if (scanbeam < sbt_entries) { + /* === SCANBEAM INTERIOR PROCESSING ============================== */ + build_intersection_table(&it, aet, dy); + /* Process each node in the intersection table */ + for (intersect = it; intersect; intersect = intersect->next) { + e0 = intersect->ie[0]; + e1 = intersect->ie[1]; + + /* Only generate output for contributing intersections */ + if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && + (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { + p = e0->outp[ABOVE]; + q = e1->outp[ABOVE]; + ix = intersect->point.x; + iy = intersect->point.y + yb; + + in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || + (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || + (!e0->bundle[ABOVE][CLIP] && 
!e1->bundle[ABOVE][CLIP] && + e0->bside[CLIP] && e1->bside[CLIP]); + in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || + (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || + (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && + e0->bside[SUBJ] && e1->bside[SUBJ]); + + switch (op) { // Determine quadrant occupancies + case GPC_DIFF: + case GPC_INT: + tr = (in[CLIP]) && (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_XOR: + tr = (in[CLIP]) ^ (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_UNION: + tr = (in[CLIP]) || (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + } + + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + switch (vclass) { + case EMN: + new_tristrip(&tlist, e1, ix, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + break; + case ERI: + if (p) { + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + e0->outp[ABOVE] = NULL; + } + break; + case ELI: + if (q) { + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + 
gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + e1->outp[ABOVE] = NULL; + } + break; + case EMX: + if (p && q) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMN: + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + new_tristrip(&tlist, prev_edge, px, iy); + e1->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + new_tristrip(&tlist, e0, ix, iy); + next_edge->outp[ABOVE] = e0->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + break; + case ILI: + if (p) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + e0->outp[ABOVE] = NULL; + } + break; + case IRI: + if (q) { + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + e1->outp[ABOVE] = NULL; + } + break; + case IMX: + if (p && q) { + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + new_tristrip(&tlist, prev_edge, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + next_edge->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + } + break; + case IMM: + if (p && q) { + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + new_tristrip(&tlist, prev_edge, px, iy); + gpc_n_edge(next_edge, e1, 
ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e1->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + new_tristrip(&tlist, e0, ix, iy); + next_edge->outp[ABOVE] = e0->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + } + break; + case EMM: + if (p && q) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + new_tristrip(&tlist, e1, ix, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + } + break; + default: + break; + } /* End of switch */ + } /* End of contributing intersection conditional */ + + // Swap bundle sides in response to edge crossing + if (e0->bundle[ABOVE][CLIP]) { + e1->bside[CLIP] = !e1->bside[CLIP]; + } + if (e1->bundle[ABOVE][CLIP]) { + e0->bside[CLIP] = !e0->bside[CLIP]; + } + if (e0->bundle[ABOVE][SUBJ]) { + e1->bside[SUBJ] = !e1->bside[SUBJ]; + } + if (e1->bundle[ABOVE][SUBJ]) { + e0->bside[SUBJ] = !e0->bside[SUBJ]; + } + + /* Swap e0 and e1 bundles in the AET */ + prev_edge = e0->prev; + next_edge = e1->next; + if (e1->next) { + e1->next->prev = e0; + } + + if (e0->bstate[ABOVE] == BUNDLE_HEAD) { + search = 1; + while (search) { + prev_edge = prev_edge->prev; + if (prev_edge) { + if (prev_edge->bundle[ABOVE][CLIP] || + prev_edge->bundle[ABOVE][SUBJ] || + (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) { + search = 0; + } + } else { + search = 0; + } + } + } + if (!prev_edge) { + e1->next = aet; + aet = e0->next; + } else { + e1->next = prev_edge->next; + prev_edge->next = e0->next; + } + e0->next->prev = prev_edge; + e1->next->prev = e1; + e0->next = next_edge; + } /* End of IT loop*/ + + /* Prepare for next scanbeam */ + for (edge = aet; edge; edge = next_edge) { + next_edge = edge->next; + succ_edge = edge->succ; + + if ((edge->top.y == yt) && succ_edge) { + /* Replace AET edge by its successor */ + succ_edge->outp[BELOW] = edge->outp[ABOVE]; + succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; + succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + succ_edge->bundle[BELOW][SUBJ] = 
edge->bundle[ABOVE][SUBJ]; + prev_edge = edge->prev; + if (prev_edge) { + prev_edge->next = succ_edge; + } else { + aet = succ_edge; + } + if (next_edge) { + next_edge->prev = succ_edge; + } + succ_edge->prev = prev_edge; + succ_edge->next = next_edge; + } else { + /* Update this edge */ + edge->outp[BELOW] = edge->outp[ABOVE]; + edge->bstate[BELOW] = edge->bstate[ABOVE]; + edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + edge->xb = edge->xt; + } + edge->outp[ABOVE] = NULL; + } + } + } /* === END OF SCANBEAM PROCESSING ================================== */ + + // Generate result tristrip from tlist + result->strip = NULL; + result->num_strips = count_tristrips(tlist); + if (result->num_strips > 0) { + gpc_malloc(result->strip, + result->num_strips * sizeof(gpc_vertex_list), + const_cast("tristrip list creation")); + + s = 0; + for (tn = tlist; tn; tn = tnn) { + tnn = tn->next; + if (tn->active > 2) { + /* Valid tristrip: copy the vertices and free the heap */ + result->strip[s].num_vertices = tn->active; + gpc_malloc(result->strip[s].vertex, + tn->active * sizeof(gpc_vertex), + const_cast("tristrip creation")); + v = 0; + if (0) { + lt = tn->v[RIGHT]; + rt = tn->v[LEFT]; + } else { + lt = tn->v[LEFT]; + rt = tn->v[RIGHT]; + } + while (lt || rt) { + if (lt) { + ltn = lt->next; + result->strip[s].vertex[v].x = lt->x; + result->strip[s].vertex[v].y = lt->y; + v++; + gpc_free(lt); + lt = ltn; + } + if (rt) { + rtn = rt->next; + result->strip[s].vertex[v].x = rt->x; + result->strip[s].vertex[v].y = rt->y; + v++; + gpc_free(rt); + rt = rtn; + } + } + s++; + } else { + /* Invalid tristrip: just free the heap */ + for (lt = tn->v[LEFT]; lt; lt = ltn) { + ltn = lt->next; + gpc_free(lt); + } + for (rt = tn->v[RIGHT]; rt; rt = rtn) { + rtn = rt->next; + gpc_free(rt); + } + } + gpc_free(tn); + } + } + // Tidy up + reset_it(&it); + reset_lmt(&lmt); + gpc_free(c_heap); + gpc_free(s_heap); + gpc_free(sbt); +} // 
NOLINT + +} // namespace gpc + +#endif diff --git a/src/operators/math/gpc.h b/src/operators/math/gpc.h new file mode 100644 index 0000000000000000000000000000000000000000..2cae7fe18458ee6f42f3cc6f374982214f041f84 --- /dev/null +++ b/src/operators/math/gpc.h @@ -0,0 +1,222 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MULTICLASSNMS_OP +#pragma once + +#include +#include +#include +#include + +namespace gpc { + +typedef enum { // Set operation type + GPC_DIFF, // Difference + GPC_INT, // Intersection + GPC_XOR, // Exclusive or + GPC_UNION // Union +} gpc_op; + +typedef struct { // Polygon vertex structure + double x; // Vertex x component + double y; // vertex y component +} gpc_vertex; + +typedef struct { // Vertex list structure + int num_vertices; // Number of vertices in list + gpc_vertex *vertex; // Vertex array pointer +} gpc_vertex_list; + +typedef struct { // Polygon set structure + int num_contours; // Number of contours in polygon + int *hole; // Hole external contour flags + gpc_vertex_list *contour; // Contour array pointer +} gpc_polygon; + +typedef struct { // Tristrip set structure + int num_strips; // Number of tristrips + gpc_vertex_list *strip; // Tristrip array pointer +} gpc_tristrip; + +typedef enum { LEFT, RIGHT } gpc_left_right; + +typedef enum { ABOVE, BELOW } gpc_above_below; + +typedef enum { CLIP, SUBJ } gpc_clip_subj; + +typedef enum { /* Edge intersection classes */ + NUL, /* 
Empty non-intersection */ + EMX, /* External maximum */ + ELI, /* External left intermediate */ + TED, /* Top edge */ + ERI, /* External right intermediate */ + RED, /* Right edge */ + IMM, /* Internal maximum and minimum */ + IMN, /* Internal minimum */ + EMN, /* External minimum */ + EMM, /* External maximum and minimum */ + LED, /* Left edge */ + ILI, /* Internal left intermediate */ + BED, /* Bottom edge */ + IRI, /* Internal right intermediate */ + IMX, /* Internal maximum */ + FUL /* Full non-intersection */ +} vertex_type; + +typedef enum { /* Horizontal edge states */ + NH, /* No horizontal edge */ + BH, /* Bottom horizontal edge */ + TH /* Top horizontal edge */ +} h_state; + +typedef enum { /* Edge bundle state */ + UNBUNDLED, /* Isolated edge not within a bundle */ + BUNDLE_HEAD, /* Bundle head node */ + BUNDLE_TAIL /* Passive bundle tail node */ +} bundle_state; + +typedef struct v_shape { /* Internal vertex list datatype */ + double x; /* X coordinate component */ + double y; /* Y coordinate component */ + struct v_shape *next; /* Pointer to next vertex in list */ +} vertex_node; + +typedef struct p_shape { /* Internal contour / tristrip type */ + int active; /* Active flag / vertex count */ + int hole; /* Hole / external contour flag */ + vertex_node *v[2]; /* Left and right vertex list ptrs */ + struct p_shape *next; /* Pointer to next polygon contour */ + struct p_shape *proxy; /* Pointer to actual structure used */ +} polygon_node; + +typedef struct edge_shape { + gpc_vertex vertex; /* Piggy-backed contour vertex data */ + gpc_vertex bot; /* Edge lower (x, y) coordinate */ + gpc_vertex top; /* Edge upper (x, y) coordinate */ + double xb; /* Scanbeam bottom x coordinate */ + double xt; /* Scanbeam top x coordinate */ + double dx; /* Change in x for a unit y increase */ + int type; /* Clip / subject edge flag */ + int bundle[2][2]; /* Bundle edge flags */ + int bside[2]; /* Bundle left / right indicators */ + bundle_state bstate[2]; /* Edge bundle 
state */ + polygon_node *outp[2]; /* Output polygon / tristrip pointer */ + struct edge_shape *prev; /* Previous edge in the AET */ + struct edge_shape *next; /* Next edge in the AET */ + struct edge_shape *pred; /* Edge connected at the lower end */ + struct edge_shape *succ; /* Edge connected at the upper end */ + struct edge_shape *next_bound; /* Pointer to next bound in LMT */ +} edge_node; + +inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); } + +inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); } + +inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); } + +inline int gpc_next_index(int i, int n) { return ((i + 1) % n); } + +inline int gpc_optimal(gpc_vertex *v, int i, int n) { + return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y); +} + +inline int gpc_fwd_min(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y > v[i].vertex.y && + v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y); +} + +inline int gpc_not_fmax(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y > v[i].vertex.y); +} + +inline int gpc_rev_min(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y >= v[i].vertex.y && + v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); +} + +inline int gpc_not_rmax(edge_node *v, int i, int n) { + return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); +} + +// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j) +// { +inline void gpc_p_edge(edge_node *d, edge_node *e, int p) { + d = e; + do { + d = d->prev; + } while (!d->outp[p]); + // i = d->bot.x + d->dx * (j - d->bot.y); +} + +// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j) +// { +inline void gpc_n_edge(edge_node *d, edge_node *e, int p) { + d = e; + do { + d = d->next; + } while (!d->outp[p]); + // i = d->bot.x + d->dx * (j - d->bot.y); +} + +template +void gpc_malloc(T *&p, int b, char *s) { // NOLINT + if (b > 0) { + p = 
reinterpret_cast(malloc(b)); + + if (!p) { + fprintf(stderr, "gpc malloc failure: %s\n", s); + exit(0); + } + } else { + p = NULL; + } +} + +template +void gpc_free(T *&p) { // NOLINT + if (p) { + free(p); + p = NULL; + } +} + +/* +=========================================================================== + Public Function Prototypes +=========================================================================== +*/ + +void add_vertex(vertex_node **t, double x, double y); + +void gpc_vertex_create(edge_node *e, int p, int s, double x, double y); + +void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole); + +void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon, + gpc_polygon *clip_polygon, gpc_polygon *result_polygon); + +void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon, + gpc_polygon *clip_polygon, + gpc_tristrip *result_tristrip); + +void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip); + +void gpc_free_polygon(gpc_polygon *polygon); + +void gpc_free_tristrip(gpc_tristrip *tristrip); + +} // namespace gpc + +#endif diff --git a/src/operators/math/poly_util.cpp b/src/operators/math/poly_util.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1cc1e2a40374204c8644267e8ab84af3cba5c65a --- /dev/null +++ b/src/operators/math/poly_util.cpp @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef MULTICLASSNMS_OP + +#include "operators/math/poly_util.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template +void Array2PointVec(const T* box, const size_t box_size, + std::vector>* vec) { + size_t pts_num = box_size / 2; + vec->resize(pts_num); + for (size_t i = 0; i < pts_num; i++) { + vec->at(i).x = box[2 * i]; + vec->at(i).y = box[2 * i + 1]; + } +} + +template +void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly) { + size_t pts_num = box_size / 2; + poly->num_contours = 1; + poly->hole = reinterpret_cast(malloc(sizeof(int))); + poly->hole[0] = 0; + poly->contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + poly->contour->num_vertices = pts_num; + poly->contour->vertex = + (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); + for (size_t i = 0; i < pts_num; ++i) { + poly->contour->vertex[i].x = box[2 * i]; + poly->contour->vertex[i].y = box[2 * i + 1]; + } +} + +template void Array2Poly(const float* box, const size_t box_size, + gpc::gpc_polygon* poly); + +template +void Poly2PointVec(const gpc::gpc_vertex_list& contour, + std::vector>* vec) { + int pts_num = contour.num_vertices; + vec->resize(pts_num); + for (size_t i = 0; i < pts_num; i++) { + vec->at(i).x = contour.vertex[i].x; + vec->at(i).y = contour.vertex[i].y; + } +} + +template +T GetContourArea(const std::vector>& vec) { + int pts_num = vec.size(); + if (pts_num < 3) return T(0.); + T area = T(0.); + for (size_t i = 0; i < pts_num; ++i) { + area += vec[i].x * vec[(i + 1) % pts_num].y - + vec[i].y * vec[(i + 1) % pts_num].x; + } + return fabs(area / 2.0); +} + +template +T PolyArea(const T* box, const size_t box_size, const bool normalized) { + // If coordinate values are is invalid + // if area size <= 0, return 0. 
+ std::vector> vec; + Array2PointVec(box, box_size, &vec); + return GetContourArea(vec); +} + +template float PolyArea(const float* box, const size_t box_size, + const bool normalized); + +template +T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + gpc::gpc_polygon poly1; + gpc::gpc_polygon poly2; + Array2Poly(box1, box_size, &poly1); + Array2Poly(box2, box_size, &poly2); + gpc::gpc_polygon respoly; + gpc::gpc_op op = gpc::GPC_INT; + gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly); + + T inter_area = T(0.); + int contour_num = respoly.num_contours; + for (int i = 0; i < contour_num; ++i) { + std::vector> resvec; + Poly2PointVec(respoly.contour[i], &resvec); + inter_area += GetContourArea(resvec); + } + + gpc::gpc_free_polygon(&poly1); + gpc::gpc_free_polygon(&poly2); + gpc::gpc_free_polygon(&respoly); + return inter_area; +} + +template float PolyOverlapArea(const float* box1, const float* box2, + const size_t box_size, const bool normalized); + +} // namespace math +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/poly_util.h b/src/operators/math/poly_util.h new file mode 100644 index 0000000000000000000000000000000000000000..96951a0ab1ff9ab25553b7290cfbb4a21c54cfc8 --- /dev/null +++ b/src/operators/math/poly_util.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef MULTICLASSNMS_OP +#pragma once + +#include +#include "operators/math/gpc.h" + +namespace paddle_mobile { +namespace operators { +namespace math { + +template +class Point_ { + public: + // default constructor + Point_() {} + Point_(T _x, T _y) {} + Point_(const Point_& pt) {} + + Point_& operator=(const Point_& pt); + // conversion to another data type + // template operator Point_<_T>() const; + // conversion to the old-style C structures + // operator Vec() const; + + // checks whether the point is inside the specified rectangle + // bool inside(const Rect_& r) const; + T x; //!< x coordinate of the point + T y; //!< y coordinate of the point +}; + +template +void Array2PointVec(const T* box, const size_t box_size, + std::vector>* vec); + +template +void Array2Poly(const T* box, const size_t box_size, gpc::gpc_polygon* poly); + +template +void Poly2PointVec(const gpc::gpc_vertex_list& contour, + std::vector>* vec); + +template +T GetContourArea(const std::vector>& vec); + +template +T PolyArea(const T* box, const size_t box_size, const bool normalized); + +template +T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, + const bool normalized); + +} // namespace math +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/math/selected_rows_functor.h b/src/operators/math/selected_rows_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..f8b5521e4d19fd3199e7b05a902c98b731c9fbd0 --- /dev/null +++ b/src/operators/math/selected_rows_functor.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "framework/selected_rows.h" + +#define INLINE_FOR2(sizei, sizej) \ + for (int64_t i = 0; i < sizei; i++) \ + for (int64_t j = 0; j < sizej; j++) + +namespace paddle_mobile { +namespace operators { +namespace math { + +// SelectedRows + SelectedRows will simplely concat value and rows. +// The real computation happens in dealing with LoDTensor. +// template +// struct SelectedRowsAdd { +// void operator()( +// const framework::SelectedRows& input1, +// const framework::SelectedRows& input2, +// framework::SelectedRows* output); +//}; +// +// template +// struct SelectedRowsAddTensor { +// void operator()( +// const framework::SelectedRows& input1, +// const framework::Tensor& input2, framework::Tensor* output); +//}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddTo { + void operator()(const framework::SelectedRows& input1, + const int64_t input2_offset, + framework::SelectedRows* input2) { + auto in1_height = input1.height(); + PADDLE_MOBILE_ENFORCE(in1_height == input2->height(), "height error"); + + auto& in1_rows = input1.rows(); + auto& in2_rows = *(input2->mutable_rows()); + + auto& in1_value = input1.value(); + auto* in2_value = input2->mutable_value(); + + // concat rows + in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + + // auto in1_place = input1.place(); + // PADDLE_ENFORCE(platform::is_cpu_place(in1_place)); + // auto in2_place = input2->place(); + // PADDLE_ENFORCE(platform::is_cpu_place(in2_place)); + + auto* in1_data = in1_value.data(); + auto* in2_data = in2_value->data(); + memory::Copy(in2_data + 
input2_offset, in1_data, + in1_value.numel() * sizeof(T)); + } +}; + +// input2 = input1 + input2 +template +struct SelectedRowsAddToTensor { + void operator()(const framework::SelectedRows& input1, + framework::Tensor* input2) { + auto in1_height = input1.height(); + auto in2_dims = input2->dims(); + PADDLE_MOBILE_ENFORCE(in1_height == in2_dims[0], "height != dims[0]"); + + auto& in1_value = input1.value(); + auto& in1_rows = input1.rows(); + + int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); + PADDLE_MOBILE_ENFORCE(in1_row_numel == input2->numel() / in1_height, + "row_numel error"); + + auto* in1_data = in1_value.data(); + auto* input2_data = input2->data(); + + for (size_t i = 0; i < in1_rows.size(); i++) { + for (int64_t j = 0; j < in1_row_numel; j++) { + input2_data[in1_rows[i] * in1_row_numel + j] += + in1_data[i * in1_row_numel + j]; + } + } + } +}; + +// namespace scatter { +//// functors for manuplating SelectedRows data +// template +// struct MergeAdd { +// // unary functor, merge by adding duplicated rows in +// // the input SelectedRows object. 
+// framework::SelectedRows operator()( +// const framework::SelectedRows& input); +//}; + +// template +// struct Add { +// framework::SelectedRows operator()( +// const framework::SelectedRows& input1, +// const framework::SelectedRows& input2) { +// framework::SelectedRows out; +// out.set_rows(input1.rows()); +// out.set_height(input1.height()); +// out.mutable_value()->mutable_data(input1.value().dims(), +// ); +// auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); +// auto e_in1 = framework::EigenVector::Flatten(input1.value()); +// auto e_in2 = framework::EigenVector::Flatten(input2.value()); +// e_out.device(*context.eigen_device()) = e_in1 + e_in2; +// return out; +// } +//}; + +// template +// struct Mul { +// // multiply two SelectedRows +// framework::SelectedRows operator()( +// const framework::SelectedRows& input1, +// const framework::SelectedRows& input2) { +// framework::SelectedRows out; +// out.set_rows(input1.rows()); +// out.set_height(input1.height()); +// out.mutable_value()->mutable_data(input1.value().dims() +// ); +// auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); +// auto e_in1 = framework::EigenVector::Flatten(input1.value()); +// auto e_in2 = framework::EigenVector::Flatten(input2.value()); +// e_out.device(*context.eigen_device()) = e_in1 * e_in2; +// return out; +// } +// // multiply scalar to SelectedRows +// framework::SelectedRows operator()( +// const framework::SelectedRows& input1, +// const T input2) { +// framework::SelectedRows out; +// out.set_rows(input1.rows()); +// out.set_height(input1.height()); +// out.mutable_value()->mutable_data(input1.value().dims(), +// ); +// auto e_out = framework::EigenVector::Flatten(*(out.mutable_value())); +// auto e_in1 = framework::EigenVector::Flatten(input1.value()); +// e_out.device(*context.eigen_device()) = input2 * e_in1; +// return out; +// } +//}; + +enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; + +// out = 
seleted_rows_in / tensor +template +struct UpdateToTensor { + void operator()(const ScatterOps& op, const framework::SelectedRows& input1, + framework::Tensor* input2); +}; + +// namespace scatter +} // namespace math +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/multiclass_nms_op.cpp b/src/operators/multiclass_nms_op.cpp index 97f4f1a1c650e2810b99a2938962ee7f8371dd2f..9d6ffaf3a78c036beb3e1783930c68d08be0cc0e 100644 --- a/src/operators/multiclass_nms_op.cpp +++ b/src/operators/multiclass_nms_op.cpp @@ -25,14 +25,15 @@ void MultiClassNMSOp::InferShape() const { if (input_scores_dims.size() != 3) { LOG(kLOG_ERROR) << "Input Scores size must be 3"; } - if (input_bboxes_dims[2] != 4) { - LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be 4"; + if (input_bboxes_dims[2] % 4 != 0 || input_bboxes_dims[2] < 4) { + LOG(kLOG_ERROR) << "Input BBoxes 2nd dimension must be multiples of 4"; } if (input_bboxes_dims[1] != input_scores_dims[2]) { LOG(kLOG_ERROR) << "Predict bboxes must be equal"; } // pre size, will change in Compute. 
- this->param_.Out()->Resize(framework::make_ddim({input_bboxes_dims[1], 6})); + this->param_.Out()->Resize( + framework::make_ddim({input_bboxes_dims[1], input_bboxes_dims[2] + 2})); } } // namespace operators diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 9c89a5b9b9266686eafd580170a1d4673c601b28..b2ced3294381d2ec97672dcbc86fe9da741de4d0 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -35,6 +35,7 @@ using framework::AttributeMap; using framework::LoDTensor; using framework::Scope; using framework::Tensor; +using framework::Variable; using std::string; using std::vector; @@ -182,6 +183,11 @@ class OpParam { return GetMultiVarValue("X", inputs, scope); } + static vector InputMultiVarsFrom(const VariableNameMap &inputs, + const Scope &scope) { + return GetMultiVar("X", inputs, scope); + } + template static T *OutputBatchGateFrom(const VariableNameMap &outputs, const Scope &scope) { @@ -216,6 +222,11 @@ class OpParam { return GetVarValue("Output", outputs, scope); } + static Variable *OutVarFrom(const VariableNameMap &outputs, + const Scope &scope) { + return GetVar("Out", outputs, scope); + } + template static T *OutFrom(const VariableNameMap &outputs, const Scope &scope) { return GetVarValue("Out", outputs, scope); @@ -286,6 +297,19 @@ class OpParam { } } + static Variable *GetVar(const string &key, const VariableNameMap &var_map, + const Scope &scope) { + PADDLE_MOBILE_ENFORCE(var_map.count(key) > 0, + "%s is not contained in var_map", key.c_str()) + auto var_vec = var_map.at(key); + if (!var_vec.empty()) { + auto var = scope.FindVar(var_vec[0]); + return var; + } else { + return nullptr; + } + } + static std::string getkey(const string &key, const VariableNameMap &var_map, int index) { auto var_vec = var_map.at(key); @@ -319,6 +343,19 @@ class OpParam { } return var_res; } + + static vector GetMultiVar(const string &key, + const VariableNameMap &var_map, + const Scope &scope) { + auto var_vecs = var_map.at(key); + 
assert(var_vecs.size() > 1); + vector var_res; + for (auto &var_vec : var_vecs) { + auto var = scope.FindVar(var_vec); + var_res.push_back(var); + } + return var_res; + } }; template @@ -405,11 +442,75 @@ class ElementwiseAddParam : OpParam { #endif }; +#ifdef ELEMENTWISEMUL_OP +template +class ElementwiseMulParam : OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + ElementwiseMulParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + axis_ = GetAttr("axis", attrs); + } + + const GType *InputX() const { return input_x_; } + + const GType *InputY() const { return input_y_; } + + GType *Out() const { return out_; } + + const int &Axis() const { return axis_; } + + private: + GType *input_x_; + GType *input_y_; + GType *out_; + int axis_; +}; +#endif + #ifdef FUSION_ELEMENTWISEADDRELU_OP template using ElementwiseAddReluParam = ElementwiseAddParam; #endif +#ifdef ELEMENTWISESUB_OP +template +class ElementwiseSubParam : OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + ElementwiseSubParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + input_y_ = InputYFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + axis_ = GetAttr("axis", attrs); + } + + const GType *InputX() const { return input_x_; } + + const GType *InputY() const { return input_y_; } + + GType *Out() const { return out_; } + + const int &Axis() const { return axis_; } + + private: + GType *input_x_; + GType *input_y_; + GType *out_; + int axis_; +}; +#endif + #ifdef MUL_OP template class MulParam : OpParam { @@ -445,11 +546,11 @@ class MulParam : OpParam { #ifdef 
PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -490,6 +591,37 @@ class ConcatParam : public OpParam { }; #endif +#ifdef SUM_OP +template +class SumParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + SumParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + inputs_vars_ = InputMultiVarsFrom(inputs, scope); + out_var_ = OutVarFrom(outputs, scope); + inputs_ = InputMultiFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + + vector InputsVars() const { return inputs_vars_; } + + Variable *OutVar() const { return out_var_; } + + vector Inputs() const { return inputs_; } + + GType *Out() const { return out_; } + + private: + vector inputs_vars_; + Variable *out_var_; + vector inputs_; + GType *out_; +}; +#endif + #ifdef LRN_OP template class LrnParam : public OpParam { @@ -1269,11 +1401,11 @@ class FusionFcParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1309,11 +1441,11 @@ class FusionConvAddParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs 
fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1364,11 +1496,11 @@ class FusionConvAddPReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1422,11 +1554,11 @@ class FusionConvAddAddPReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1497,11 +1629,11 @@ class FusionConvAddBNReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1583,11 +1715,11 @@ class FusionConvBNAddReluParam : public ConvParam { #ifdef 
PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1650,11 +1782,11 @@ class FusionConvBNParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1725,11 +1857,11 @@ class FusionConvAddBNParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1851,11 +1983,11 @@ class FusionConvBNReluParam : public ConvParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::WrapperConvArgs fpga_conv_args; + fpga::SplitConvArgs fpga_conv_args; public: - const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } + const fpga::SplitConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::SplitConvArgs &args) { fpga_conv_args = args; } #endif }; #endif diff 
--git a/src/operators/sum_op.cpp b/src/operators/sum_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2e10363b07498128b5573e27a3d63b59c454d8b6 --- /dev/null +++ b/src/operators/sum_op.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SUM_OP + +#include + +#include "operators/sum_op.h" + +namespace paddle_mobile { +namespace operators { + +template +void SumOp::InferShape() const { + auto inputs = this->param_.Inputs(); + const size_t n = inputs.size(); + + std::vector inputs_dims; + inputs_dims.reserve(n); + for (int i = 0; i < n; i++) { + inputs_dims.push_back(inputs[i]->dims()); + } + + if (n == 1) { + DLOG << "Warning: sum op have only one input, " + "may waste memory"; + } + + framework::DDim in_dim({0}); + + for (auto& x_dim : inputs_dims) { + if (framework::product(x_dim) == 0) { + continue; + } + if (framework::product(in_dim) == 0) { + in_dim = x_dim; + } else { + PADDLE_MOBILE_ENFORCE(in_dim == x_dim, + "input tensors must have same shape"); + } + } + + this->param_.Out()->Resize(in_dim); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(sum, ops::SumOp); +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +REGISTER_OPERATOR_MALI_GPU(sum, ops::ConcatOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +#endif + +#endif diff --git a/src/operators/sum_op.h b/src/operators/sum_op.h 
new file mode 100644 index 0000000000000000000000000000000000000000..aad8e8322b60d0e931215c9d48d97862f9b14107 --- /dev/null +++ b/src/operators/sum_op.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SUM_OP + +#pragma once + +#include +#include "framework/operator.h" +#include "operators/kernel/sum_kernel.h" +#include "operators/op_param.h" +namespace paddle_mobile { +namespace operators { +using std::string; +template +class SumOp : public framework::OperatorWithKernel< + DeviceType, SumParam, + operators::SumKernel> { + public: + SumOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel, + operators::SumKernel>( + type, inputs, outputs, attrs, scope) {} + + using framework::OperatorWithKernel< + DeviceType, SumParam, + operators::SumKernel>::OperatorWithKernel; + void InferShape() const override; + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b38ff2e47a9d80d8b907e88b6ed5d6d4bcbed513..9086f25de516f01c3033428b26331829d78e14e0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,38 +61,11 @@ endif () list(FIND NET "FPGAnets" CON) if (CON GREATER -1) - ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) - 
target_link_libraries(test-resnet paddle-mobile) - ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-resnet50 paddle-mobile) - ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-EW paddle-mobile) - - ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-conv paddle-mobile) - - ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-pooling paddle-mobile) - - ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-bypass paddle-mobile) - - ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-softmax paddle-mobile) - - ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-fpga-concat paddle-mobile) - - ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h) - target_link_libraries(test-tensor-quant paddle-mobile) - - ADD_EXECUTABLE(test-fpga-concat-op fpga/test_concat_op.cpp test_helper.h test_include.h) - target_link_libraries(test-fpga-concat-op paddle-mobile) - - ADD_EXECUTABLE(test-format-data fpga/test_format_data.cpp test_helper.h test_include.h) - target_link_libraries(test-format-data paddle-mobile) +# ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) +# target_link_libraries(test-resnet paddle-mobile) set(FOUND_MATCH ON) endif () @@ -173,6 +146,14 @@ if (NOT FOUND_MATCH) target_link_libraries(test-elementwiseadd-op paddle-mobile) # gen test + 
ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h) + target_link_libraries(test-elementwisesub-op paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h) + target_link_libraries(test-im2sequence-op paddle-mobile) + + # gen test ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) target_link_libraries(test-concat-op paddle-mobile) @@ -212,6 +193,10 @@ if (NOT FOUND_MATCH) ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) target_link_libraries(test-fc-op paddle-mobile) + # gen test + ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h) + target_link_libraries(test-sum-op paddle-mobile) + # test quantize op ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h) target_link_libraries(test-quantize-op paddle-mobile) diff --git a/test/executor_for_test.h b/test/executor_for_test.h index 93847af20a6d48a6df33dc50f6c6a1db76facf51..60f1856bb9294c6f9b4bd5cfb7d44f984c6f0794 100644 --- a/test/executor_for_test.h +++ b/test/executor_for_test.h @@ -43,7 +43,7 @@ template class Executor4Test : public Executor { public: Executor4Test(Program p, string op_type, - bool use_optimize = false, int predict_op_count = 1) + bool use_optimize = false) : Executor() { this->use_optimize_ = use_optimize; this->program_ = p; @@ -64,7 +64,7 @@ class Executor4Test : public Executor { std::vector> ops = block_desc->Ops(); for (int i = 0; i < ops.size(); ++i) { auto op = ops[i]; - if (op->Type() == op_type && i < predict_op_count) { + if (op->Type() == op_type) { DLOG << "匹配到: " << op->Type(); /// test first meeting op in program @@ -74,6 +74,7 @@ class Executor4Test : public Executor { op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), this->program_.scope); this->ops_of_block_[*block_desc.get()].push_back(op_ptr); 
+ break; } } } diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index f850eb3e5ea3a03fe90d82c1eca2af6c9f8e9106..6754a51fa55b0744b94ee70209da1a3fe88f2f32 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -30,7 +30,11 @@ int main() { input_tensor.data() + input_tensor.numel()); paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); + for (int i = 0; i < 1000; i++) { + paddle_mobile.Predict_To(-1); + if (i % 100 == 0) std::cout << i << std::endl; + } + // paddle_mobile.Predict_From(73); // paddle_mobile.Predict_From_To(72, 73); diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cfac83eff7a012d52d47f96e088bd8519603cadc --- /dev/null +++ b/test/operators/test_elementwise_sub_op.cpp @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "../test_helper.h" +#include "../test_include.h" +#include "operators/elementwise_sub_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestElementwiseSubOp { + public: + explicit TestElementwiseSubOp(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + + const std::vector> blocks = + to_predict_program_->Blocks(); + // DLOG << " **block size " << blocks.size(); + for (int i = 0; i < blocks.size(); ++i) { + std::shared_ptr block_desc = blocks[i]; + std::vector> ops = block_desc->Ops(); + // DLOG << " ops " << ops.size(); + for (int j = 0; j < ops.size(); ++j) { + std::shared_ptr op = ops[j]; + if (op->Type() == "elementwise_sub" && + op->Input("X")[0] == "sigmoid_1.tmp_0") { + DLOG << " elementwise_sub attr size: " << op->GetAttrMap().size(); + DLOG << " inputs size: " << op->GetInputs().size(); + DLOG << " outputs size: " << op->GetOutputs().size(); + + std::shared_ptr> lrn = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(lrn); + } + } + } + } + + std::shared_ptr predict_bn(const Tensor &t1, const Tensor &t2) { + // feed + auto scope = program_.scope; + Variable *x1_feed_value = scope->Var("tmp_0"); + auto tensor_x1 = x1_feed_value->GetMutable(); + tensor_x1->ShareDataWith(t1); + + Variable *x2_feed_value = scope->Var("sigmoid_1.tmp_0"); + auto tensor_x2 = x2_feed_value->GetMutable(); + tensor_x2->ShareDataWith(t2); + + Variable *output = scope->Var("tmp_1"); + auto *output_tensor = output->GetMutable(); + output_tensor->mutable_data({1, 1, 6, 6}); + // DLOG << typeid(output_tensor).name(); + // DLOG << "output_tensor dims: " << output_tensor->dims(); + + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); + + predict_bn(t1, t2, 0); + return out_tensor; + } 
+ + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + + void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + DLOG << "op -> run()"; + op->Run(); + } + } +}; + +template class TestElementwiseSubOp; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run ElementwiseSub Test"; + paddle_mobile::Loader loader; + auto program = loader.Load(std::string(g_ocr) + "/model", + std::string(g_ocr) + "/params"); + + /// input x1 (1,1,6,6) + paddle_mobile::framework::Tensor inputx1; + SetupTensor(&inputx1, {1, 1, 6, 6}, static_cast(0), + static_cast(1)); + auto *inputx1_ptr = inputx1.data(); + + /// input x2 (1,1,6,6) + paddle_mobile::framework::Tensor inputx2; + SetupTensor(&inputx2, {1, 1, 6, 6}, static_cast(0), + static_cast(1)); + auto *inputx2_ptr = inputx2.data(); + + paddle_mobile::framework::TestElementwiseSubOp + testElementwiseSubOp(program); + + auto output_op = testElementwiseSubOp.predict_bn(inputx1, inputx2); + auto *output_op_ptr = output_op->data(); + + auto inputx1_dim = inputx1.numel() / inputx1.dims()[0]; + DLOG << " input1 : "; + for (int i = 0; i < inputx1.dims()[0]; ++i) { + for (int j = 0; j < inputx1_dim; ++j) { + DLOGF("%f ", inputx1_ptr[i * inputx1_dim + j]); + } + DLOGF("\n"); + } + + auto inputx2_dim = inputx2.numel() / inputx2.dims()[0]; + DLOG << " input2 : "; + for (int i = 0; i < inputx2.dims()[0]; ++i) { + for (int j = 0; j < inputx2_dim; ++j) { + DLOGF("%f ", inputx2_ptr[i * inputx2_dim + j]); + } + DLOGF("\n"); + } + + auto output_dim = output_op->numel() / output_op->dims()[0]; + DLOG << " output : "; + for (int i = 0; i < output_op->dims()[0]; 
++i) { + for (int j = 0; j < output_dim; ++j) { + DLOGF("%f ", output_op_ptr[i * output_dim + j]); + } + DLOGF("\n"); + } + + return 0; +} diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp index a7512d3bf3cffcb100fe292e50fc7b7b23fa0aa0..b45e437e12f95cd9f7050247fc03a152246d8122 100644 --- a/test/operators/test_im2sequence_op.cpp +++ b/test/operators/test_im2sequence_op.cpp @@ -12,51 +12,129 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "../executor_for_test.h" +#pragma once + +#include "../test_helper.h" #include "../test_include.h" #include "operators/im2sequence_op.h" -int main() { - paddle_mobile::Loader loader; - auto program = loader.Load(g_ocr_recg); - PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, - "program file read fail"); +namespace paddle_mobile { +namespace framework { - Executor4Test> - executor(program, "im2sequence"); +template +class TestIm2SequenceOp { + public: + explicit TestIm2SequenceOp(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } - // 1. 
input_tensors; - vector input_tensors; + const std::vector> blocks = + to_predict_program_->Blocks(); + // DLOG << " **block size " << blocks.size(); + for (int i = 0; i < blocks.size(); ++i) { + std::shared_ptr block_desc = blocks[i]; + std::vector> ops = block_desc->Ops(); + // DLOG << " ops " << ops.size(); + for (int j = 0; j < ops.size(); ++j) { + std::shared_ptr op = ops[j]; + if (op->Type() == "im2sequence" && + op->Input("X")[0] == "conv2d_19.tmp_1") { + DLOG << " im2squence attr size: " << op->GetAttrMap().size(); + DLOG << " inputs size: " << op->GetInputs().size(); + DLOG << " outputs size: " << op->GetOutputs().size(); - Tensor input1; - auto input1_data = CreateInput(&input1, {2, 2, 3, 3}, -1, 1); - input_tensors.push_back(input1); + std::shared_ptr> lrn = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(lrn); + } + } + } + } - // 2. input_names - vector input_names({ - "conv2d_19.tmp_1", - }); + std::shared_ptr predict_bn(const Tensor &t1) { + // feed + auto scope = program_.scope; + Variable *x1_feed_value = scope->Var("conv2d_19.tmp_1"); + auto tensor_x1 = x1_feed_value->GetMutable(); + tensor_x1->ShareDataWith(t1); - // 3. output_names - vector output_names({"im2sequence_0.tmp_0"}); + Variable *output = scope->Var("im2sequence_0.tmp_0"); + auto *output_tensor = output->GetMutable(); + output_tensor->mutable_data({2, 12}); + // DLOG << typeid(output_tensor).name(); + // DLOG << "output_tensor dims: " << output_tensor->dims(); - // 4. 
out_dims; - vector out_ddims; - auto out_ddim = paddle_mobile::framework::make_ddim({8, 9}); - out_ddims.push_back(out_ddim); + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); - auto output = executor.Predict(input_tensors, input_names, - output_names, out_ddims); + predict_bn(t1, 0); + return out_tensor; + } - auto output0_data = output[0]->data(); + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; - for (int j = 0; j < input_tensors[0].numel(); ++j) { - DLOG << " value of input: " << input1_data[j]; + void predict_bn(const Tensor &t1, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + DLOG << "op -> run()"; + op->Run(); + } } +}; + +template class TestIm2SequenceOp; +} // namespace framework +} // namespace paddle_mobile - for (int j = 0; j < output[0]->numel(); ++j) { - DLOG << " value of output: " << output0_data[j]; +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run Im2Sequence Test"; + paddle_mobile::Loader loader; + auto program = loader.Load(std::string(g_eng) + "/model", + std::string(g_eng) + "/params"); + + /// input x (4,10,2,2) + paddle_mobile::framework::Tensor inputx; + SetupTensor(&inputx, {1, 2, 6, 2}, static_cast(0), + static_cast(1)); + auto *inputx_ptr = inputx.data(); + + paddle_mobile::framework::TestIm2SequenceOp + testIm2SequenceOp(program); + + auto output_op = testIm2SequenceOp.predict_bn(inputx); + auto *output_op_ptr = output_op->data(); + + auto input_dim = inputx.numel() / inputx.dims()[0]; + DLOG << " input : "; + for (int i = 0; i < inputx.dims()[0]; ++i) { + for (int j = 0; j < input_dim; ++j) { + DLOGF("%f ", inputx_ptr[i * input_dim + j]); + } + DLOGF("\n"); } + + auto output_dim = 
output_op->numel() / output_op->dims()[0]; + DLOG << " output : "; + for (int i = 0; i < output_op->dims()[0]; ++i) { + for (int j = 0; j < output_dim; ++j) { + DLOGF("%f ", output_op_ptr[i * output_dim + j]); + } + DLOGF("\n"); + } + return 0; } diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp index 3080100e70fe7f6051b91fbe1bf40b968056c257..678add6dcedd22e788e0bd2df64a8eba59ad8514 100644 --- a/test/operators/test_mul_op.cpp +++ b/test/operators/test_mul_op.cpp @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "../test_helper.h" #include "../test_include.h" #include "operators/mul_op.h" @@ -73,12 +74,20 @@ int TestMulOP() { } } + int32_t eq = 0; + int32_t neq = 0; for (int32_t i = 0; i < m * n; ++i) { PADDLE_MOBILE_ENFORCE( output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i, static_cast(output_data[i]), i, static_cast(c[i])); + if (static_cast(output_data[i] == c[i])) { + ++eq; + } else { + ++neq; + } } - DLOG << "Run MulOp successfully!"; + DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq + << " neq=" << neq; delete op; return 0; } diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp index e6c41bd4b3bb241964a23accf4633e65818465be..d1b98d4965fd182ab1adc480279f38cea53974be 100644 --- a/test/operators/test_multiclass_nms_op.cpp +++ b/test/operators/test_multiclass_nms_op.cpp @@ -127,18 +127,25 @@ int main() { DLOG << "----------**********----------"; DLOG << "begin to run MulticlassNMS Test"; paddle_mobile::Loader loader; - auto program = loader.Load(std::string("../../test/models/mobilenet+ssd")); + auto program = loader.Load(std::string(g_mobilenet_ssd)); - /// input x (1,3,300,300) paddle_mobile::framework::Tensor inputx1; - SetupTensor(&inputx1, {10, 1917, 4}, static_cast(0), + SetupTensor(&inputx1, {1, 2, 
4}, static_cast(0), static_cast(1)); auto *inputx1_ptr = inputx1.data(); + const float x1[] = {0, 0, 100, 100, 50, 50, 150, 150}; + for (int i = 0; i < 8; ++i) { + *(inputx1_ptr + i) = x1[i]; + } paddle_mobile::framework::Tensor inputx2; - SetupTensor(&inputx2, {10, 21, 1917}, static_cast(0), + SetupTensor(&inputx2, {1, 2, 2}, static_cast(0), static_cast(1)); auto *inputx2_ptr = inputx2.data(); + const float x2[] = {0.4, 0.3, 0.6, 0.7}; + for (int i = 0; i < 4; ++i) { + *(inputx2_ptr + i) = x2[i]; + } paddle_mobile::framework::TestMultiClassNMSOp testMultiClassNMSOp(program); @@ -146,8 +153,26 @@ int main() { auto output = testMultiClassNMSOp.predict(inputx1, inputx2); auto *output_ptr = output->data(); - for (int i = 0; i < output->numel(); i++) { + for (int i = 0; i < output->numel(); ++i) { DLOG << output_ptr[i]; } + + // test multi point + paddle_mobile::framework::Tensor inputx3; + SetupTensor(&inputx3, {1, 2, 8}, static_cast(0), + static_cast(1)); + auto *inputx3_ptr = inputx3.data(); + const float x3[] = {0, 0, 100, 0, 100, 100, 0, 100, + 50, 50, 150, 50, 150, 150, 50, 150}; + for (int i = 0; i < 16; ++i) { + *(inputx3_ptr + i) = x3[i]; + } + + auto output2 = testMultiClassNMSOp.predict(inputx3, inputx2); + auto *output_ptr2 = output2->data(); + + for (int i = 0; i < output2->numel(); ++i) { + DLOG << output_ptr2[i]; + } return 0; } diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e51d1cff5e99c5d9c444db046e78eee6a03f9243 --- /dev/null +++ b/test/operators/test_sum_op.cpp @@ -0,0 +1,133 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "../test_helper.h" +#include "../test_include.h" +#include "operators/sum_op.h" + +namespace paddle_mobile { +namespace framework { + +template +class TestSumOp { + public: + explicit TestSumOp(const Program p) : program_(p) { + if (use_optimize_) { + to_predict_program_ = program_.optimizeProgram; + } else { + to_predict_program_ = program_.originProgram; + } + + const std::vector> blocks = + to_predict_program_->Blocks(); + // DLOG << " **block size " << blocks.size(); + for (int i = 0; i < blocks.size(); ++i) { + std::shared_ptr block_desc = blocks[i]; + std::vector> ops = block_desc->Ops(); + // DLOG << " ops " << ops.size(); + for (int j = 0; j < ops.size(); ++j) { + std::shared_ptr op = ops[j]; + if (op->Type() == "sum" && op->Input("X")[0] == "fc_2.tmp_0") { + DLOG << " sum attr size: " << op->GetAttrMap().size(); + DLOG << " inputs size: " << op->GetInputs().size(); + DLOG << " outputs size: " << op->GetOutputs().size(); + + std::shared_ptr> lrn = + std::make_shared>( + op->Type(), op->GetInputs(), op->GetOutputs(), + op->GetAttrMap(), program_.scope); + ops_of_block_[*block_desc.get()].push_back(lrn); + } + } + } + } + + std::shared_ptr predict_bn(const Tensor &t1, const Tensor &t2) { + // feed + auto scope = program_.scope; + Variable *x1_feed_value = scope->Var("fc_2.tmp_0"); + auto tensor_x1 = x1_feed_value->GetMutable(); + tensor_x1->ShareDataWith(t1); + + Variable *x2_feed_value = scope->Var("fc_2.tmp_1"); + auto tensor_x2 = x2_feed_value->GetMutable(); + tensor_x2->ShareDataWith(t2); + + Variable *output = 
scope->Var("fc_2.tmp_2"); + auto *output_tensor = output->GetMutable(); + output_tensor->mutable_data({2, 96}); + // DLOG << typeid(output_tensor).name(); + // DLOG << "output_tensor dims: " << output_tensor->dims(); + + std::shared_ptr out_tensor = std::make_shared(); + out_tensor.reset(output_tensor); + + predict_bn(t1, t2, 0); + return out_tensor; + } + + private: + const framework::Program program_; + std::shared_ptr to_predict_program_; + std::map>>> + ops_of_block_; + bool use_optimize_ = false; + + void predict_bn(const Tensor &t1, const Tensor &t2, int block_id) { + std::shared_ptr to_predict_block = + to_predict_program_->Block(block_id); + for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) { + auto op = ops_of_block_[*to_predict_block.get()][j]; + DLOG << "op -> run()"; + op->Run(); + } + } +}; + +template class TestSumOp; +} // namespace framework +} // namespace paddle_mobile + +int main() { + DLOG << "----------**********----------"; + DLOG << "begin to run Sum Test"; + paddle_mobile::Loader loader; + auto program = loader.Load(std::string(g_eng) + "/model", + std::string(g_eng) + "/params"); + + /// input x (4,10,2,2) + paddle_mobile::framework::Tensor inputx1; + SetupTensor(&inputx1, {2, 96}, static_cast(0), + static_cast(1)); + auto *inputx1_ptr = inputx1.data(); + + paddle_mobile::framework::Tensor inputx2; + SetupTensor(&inputx2, {2, 96}, static_cast(0), + static_cast(1)); + auto *inputx2_ptr = inputx2.data(); + + paddle_mobile::framework::TestSumOp testSumOp(program); + + auto output_sum = testSumOp.predict_bn(inputx1, inputx2); + auto *output_sum_ptr = output_sum->data(); + + DLOG << "input1 44: " << inputx1_ptr[44]; + DLOG << "input2 44: " << inputx2_ptr[44]; + DLOG << "out 44 :" << output_sum_ptr[44]; + + return 0; +} diff --git a/test/test_helper.h b/test/test_helper.h index ecbc251a815e343f75b1247ffc430e9c52d6abfd..03ee27d71d58eb5c727172a8112aeedfde244d0f 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -27,6 
+27,7 @@ limitations under the License. */ static const char *g_ocr = "../models/ocr"; static const char *g_mobilenet_ssd = "../models/mobilenet+ssd"; static const char *g_genet_combine = "../models/enet"; +static const char *g_eng = "../models/eng_20conv_1_9_fc"; static const char *g_mobilenet_ssd_gesture = "../models/mobilenet+ssd_gesture"; static const char *g_mobilenet_combined = "../models/mobilenet_combine"; static const char *g_googlenetv1_combined = "../models/googlenetv1_combine"; @@ -51,6 +52,7 @@ static const char *g_test_image_1x3x224x224_banana = static const char *g_test_image_desktop_1_3_416_416_nchw_float = "../images/in_put_1_3_416_416_2"; static const char *g_hand = "../images/hand_image"; +static const char *g_moto = "../images/moto_300x300_float"; static const char *g_imgfssd_ar = "../images/test_image_ssd_ar"; static const char *g_imgfssd_ar1 = "../images/003_0001.txt"; static const char *g_img = "../images/img.bin"; diff --git a/tools/op.cmake b/tools/op.cmake index 898f66a634d70a5def7c7ce328a7a291d9b55c70..6e89fa4f66073c13ae216583d48d10327e6631ce 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -33,6 +33,7 @@ if (CON GREATER -1) set(POOL_OP ON) set(RESHAPE_OP ON) set(FUSION_CONVADDBNRELU_OP ON) + set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADD_OP ON) set(FOUND_MATCH ON) @@ -117,12 +118,9 @@ if (CON GREATER -1) set(POOL_OP ON) set(CONCAT_OP ON) set(SOFTMAX_OP ON) - set(DROPOUT_OP ON) set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBN_OP ON) set(FUSION_CONVADD_OP ON) - set(MUL_OP ON) - set(FOUND_MATCH ON) endif() @@ -188,6 +186,8 @@ if(NOT FOUND_MATCH) set(CONV_OP ON) set(DEPTHWISECONV_OP ON) set(ELEMENTWISEADD_OP ON) + set(ELEMENTWISESUB_OP ON) + set(IM2SEQUENCE_OP ON) set(FUSION_CONVADD_OP ON) set(FUSION_CONVADDPRELU_OP ON) set(FUSION_CONVADDRELU_OP ON) @@ -220,6 +220,8 @@ if(NOT FOUND_MATCH) set(SPLIT_OP ON) set(FLATTEN_OP ON) set(SHAPE_OP ON) + set(ELEMENTWISEMUL_OP ON) + set(SUM_OP ON) endif() # option(BATCHNORM_OP "" ON) @@ -261,6 +263,9 
@@ endif() if (ELEMENTWISEADD_OP) add_definitions(-DELEMENTWISEADD_OP) endif() +if (ELEMENTWISESUB_OP) + add_definitions(-DELEMENTWISESUB_OP) +endif() if (FUSION_CONVADD_OP) add_definitions(-DFUSION_CONVADD_OP) endif() @@ -388,3 +393,11 @@ endif() if (SHAPE_OP) add_definitions(-DSHAPE_OP) endif() + +if (ELEMENTWISEMUL_OP) + add_definitions(-DELEMENTWISEMUL_OP) +endif() +if (SUM_OP) + add_definitions(-DSUM_OP) +endif() + diff --git a/tools/pre-commit.hooks/cpplint.hook b/tools/pre-commit.hooks/cpplint.hook index 78ca3cfcdda52a223be609801e6b12ec58b79323..26c25c2e12662c1fca32b9a0eea8981b58d74f44 100644 --- a/tools/pre-commit.hooks/cpplint.hook +++ b/tools/pre-commit.hooks/cpplint.hook @@ -5,7 +5,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \ grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \ - grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do + grep -v "protobuf-c.h" | grep -v "protobuf-c.c" | grep -v "variant.h"); do cpplint $file; TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); done diff --git a/python/tools/imagetools/imagetools.py b/tools/python/imagetools/imagetools.py similarity index 100% rename from python/tools/imagetools/imagetools.py rename to tools/python/imagetools/imagetools.py diff --git a/python/tools/imagetools/img2nchw.py b/tools/python/imagetools/img2nchw.py similarity index 86% rename from python/tools/imagetools/img2nchw.py rename to tools/python/imagetools/img2nchw.py index 70ca456a1b1b5d20b92d0aaa51b01abb352c1d54..b38c9808059e08b089303208063184bb956667c1 100644 --- a/python/tools/imagetools/img2nchw.py +++ b/tools/python/imagetools/img2nchw.py @@ -45,13 +45,13 @@ def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR): print '------------------' print bgrs_float_array[0] - print bgrs_float_array[416 * 416 * 2 + 416 * 2 + 2] + print bgrs_float_array[224 * 224 * 2 + 224 * 2 + 2] 
# for i in range(0, 9): # print'bs %d' % i # print bs[i] / 255. - print bs[416 * 2 + 2] / 255. + print bs[224 * 2 + 2] / 255. print '--------------combine_bgrs_nchw-----------------end' return bgrs_float_array @@ -64,6 +64,6 @@ def combine_bgrs_nchw(bgrs, means_b_g_r, scale, channel_type=ChannelType.BGR): # cv2.waitKey(0) -bgrs = tools.resize_take_rgbs('datas/newyolo.jpg', (416, 416, 3)) +bgrs = tools.resize_take_rgbs('datas/jpgs/0000_0.9834-148196_82452-0ad4b83ec6bc0f9c5f28101539267054.jpg_p0_0.126571263346.jpg', (224, 224, 3)) array = combine_bgrs_nchw(bgrs, (0, 0, 0), 1. / 255, ChannelType.RGB) -tools.save_to_file('datas/desktop_1_3_416_416_nchw_float', array) +tools.save_to_file('datas/desktop_1_3_224_224_nchw_float', array) diff --git a/python/tools/imagetools/img2nhwc.py b/tools/python/imagetools/img2nhwc.py similarity index 100% rename from python/tools/imagetools/img2nhwc.py rename to tools/python/imagetools/img2nhwc.py diff --git a/python/tools/imagetools/numpy2binary.py b/tools/python/imagetools/numpy2binary.py similarity index 58% rename from python/tools/imagetools/numpy2binary.py rename to tools/python/imagetools/numpy2binary.py index dd4bc6e10074183b8dcee4122860c4140ff54229..87f0fda76666225256e7a80ddf3a5b0cda8ad12f 100644 --- a/python/tools/imagetools/numpy2binary.py +++ b/tools/python/imagetools/numpy2binary.py @@ -15,11 +15,11 @@ from array import array # image.resize(shape_h_w) -data = np.fromfile('datas/img.res') +data = np.fromfile('/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/imagetools/datas/jpgs2/0000_0.9834-148196_82452-0ad4b83ec6bc0f9c5f28101539267054.jpg_p0_0.126571263346.jpg.input.npfile','f') print data.size -print data[0] +print data -data.reshape(1, 3, 416, 416) +data.reshape(1, 3, 224, 224) out_array = array('f') print'--------------------' print data.size @@ -27,12 +27,12 @@ print data[0] print '如果是nhwc --------' # rgb rgb rgb rgb rgb -print data[416 * 3 * 2 + 3 * 2 + 2] +print data[224 * 3 * 2 + 3 * 2 + 2] # print 
data[2] print '如果是nchw --------' # rgb rgb rgb rgb rgb -print data[416 * 416 * 2 + 416 * 2 + 2] +print data[224 * 224 * 2 + 224 * 2 + 2] # print data[2] # 明明是nchw @@ -42,6 +42,8 @@ for i in range(0, data.size): print len(out_array) -print out_array[416 * 416 * 2 + 416 * 2 + 2] +print out_array[224 * 224 * 2 + 224 * 2 + 2] + +# print out_array -tools.save_to_file('datas/in_put_1_3_416_416_2', out_array) +tools.save_to_file('datas/in_put_1_3_224_224_nchw', out_array) diff --git a/tools/python/modeltools/.gitignore b/tools/python/modeltools/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..4108f5244bc039cb95b06e391d51250bb9d0ce42 --- /dev/null +++ b/tools/python/modeltools/.gitignore @@ -0,0 +1,109 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +/yolo/datas/ +/mobilenet/datas/ diff --git a/tools/python/modeltools/core/__init__.py b/tools/python/modeltools/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/python/tools/mdl2fluid/framework.proto b/tools/python/modeltools/core/framework.proto similarity index 100% rename from python/tools/mdl2fluid/framework.proto rename to tools/python/modeltools/core/framework.proto diff --git a/python/tools/mdl2fluid/framework_pb2.py b/tools/python/modeltools/core/framework_pb2.py similarity index 100% rename from python/tools/mdl2fluid/framework_pb2.py rename to tools/python/modeltools/core/framework_pb2.py diff --git a/python/tools/mdl2fluid/op_types.py b/tools/python/modeltools/core/op_types.py similarity index 59% rename from python/tools/mdl2fluid/op_types.py rename to tools/python/modeltools/core/op_types.py index ff7d78d20835c605dc581ef14ad2d7d5171fea1d..550f87339c9a048a3732daa7707dd6427965029a 100644 --- a/python/tools/mdl2fluid/op_types.py +++ b/tools/python/modeltools/core/op_types.py @@ -5,22 +5,28 @@ layer_mdl_conv = 'ConvolutionLayer' 
layer_mdl_deepwise_conv = 'DepthwiseConvolutionLayer' layer_mdl_relu = 'ReluLayer' layer_mdl_pointwise_add = 'PointwiseConvolutionLayer' +layer_mdl_pooling = 'PoolingLayer' +layer_mdl_softmax = 'SoftmaxLayer' # fluid ops op_fluid_fusion_conv_add = 'fusion_conv_add' op_fluid_relu = 'relu' +op_fluid_pooling = 'pool2d' +op_fluid_softmax = 'softmax' # dict mdk layer --- fluid op mdl2fluid_op_layer_dict = { layer_mdl_conv: op_fluid_fusion_conv_add, layer_mdl_deepwise_conv: op_fluid_fusion_conv_add, layer_mdl_relu: op_fluid_relu, - layer_mdl_pointwise_add: op_fluid_fusion_conv_add + layer_mdl_pointwise_add: op_fluid_fusion_conv_add, + layer_mdl_pooling: op_fluid_pooling, + layer_mdl_softmax: op_fluid_softmax } mdl_outputs_key = "outputs" mdl_inputs_key = "inputs" -mdl_weight_key = "weights" +mdl_weight_key = "weight" mdl_attrs_key = "params" # dict of mdl-input _out param to fluid input out attrs @@ -39,13 +45,30 @@ fusion_conv_add_dict = { relu_dict = { mdl_inputs_key: 'X', mdl_outputs_key: 'Out', - mdl_weight_key: () + # mdl_weight_key: () } + +pool2d_dict = { + mdl_inputs_key: 'X', + mdl_outputs_key: 'Out', + # mdl_weight_key: (), + mdl_attrs_key: ('pooling_type', 'global_pooling') + +} + +softmax_dict = { + mdl_inputs_key: 'X', + mdl_outputs_key: 'Out', + mdl_weight_key: (), + mdl_attrs_key: () +} # mdl layers --- fluid ops op_io_dict = { 'fusion_conv_add': fusion_conv_add_dict, - 'relu': relu_dict + 'relu': relu_dict, + 'pool2d': pool2d_dict, + 'softmax': softmax_dict } # fluid attr key --- mdl params key @@ -54,70 +77,17 @@ fusion_conv_add_attrs_dict = { 'strides': 'stride', 'groups': 'group' } + +# fluid attr key --- mdl params key +pool2d_attrs_dict = { + 'global_pooling': 'global_pooling', + 'pooling_type': 'type' +} + + # fluid attr key --- mdl params key fluid_attrs_type_dict = { 'paddings': 0, 'strides': 6, 'groups': 6 } - -# '': "bias_term", 是不是要add 目前 yolo的模型都是 bias_term = 1 - - -# attrs { -# name: "axis" -# type: INT -# i: 1 -# } - - -# attrs_name = { -# 
'name': "workspace_size_MB", -# 'type': 'INT', -# 'i': '4096' -# } -# attrs -# { -# name: "data_format" -# type: STRING -# s: "AnyLayout" -# } -# attrs -# { -# name: "use_mkldnn" -# type: BOOLEAN -# b: false -# } -# attrs -# { -# name: "use_cudnn" -# type: BOOLEAN -# b: true -# } -# attrs -# { -# name: "dilations" -# type: INTS -# ints: 1 -# ints: 1 -# } -# attrs -# { -# name: "groups" -# type: INT -# i: 1 -# } -# attrs -# { -# name: "paddings" -# type: INTS -# ints: 0 -# ints: 0 -# } -# attrs -# { -# name: "strides" -# type: INTS -# ints: 1 -# ints: 1 -# } diff --git a/tools/python/modeltools/mobilenet/__init__.py b/tools/python/modeltools/mobilenet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tools/python/modeltools/mobilenet/converter_mobilenet.py b/tools/python/modeltools/mobilenet/converter_mobilenet.py new file mode 100644 index 0000000000000000000000000000000000000000..ca1e1f7f4d83cf219e1e74603bb23a15c34cfb36 --- /dev/null +++ b/tools/python/modeltools/mobilenet/converter_mobilenet.py @@ -0,0 +1,509 @@ +# coding=utf-8 +import json +import os + +from core import framework_pb2 as framework_pb2, op_types as types +from mobilenet.swicher import Swichter +import shutil + + +def load_mdl(mdl_json_path): + # print('mdl json path : ' + mdl_json_path) + with open(mdl_json_path, 'r') as f: + return json.load(f) + + +def create_if_not_exit(target_dir): + if os.path.exists(target_dir): + shutil.rmtree(target_dir) + os.makedirs(target_dir, 0777) + + +class Converter: + 'convert mdlmodel to fluidmodel' + + def __init__(self, base_dir, mdl_json_path): + print 'base_dir: ' + base_dir + self.mdl_json_path = base_dir + mdl_json_path + self.base_dir = base_dir + print mdl_json_path + self.source_weights_dir = self.base_dir + 'datas/sourcemodels/source_weights/' + self.target_weight_dir = self.base_dir + 'datas/target/target_weights/' + + create_if_not_exit(self.target_weight_dir) + + 
self.mdl_json = load_mdl(self.mdl_json_path) + self.program_desc = framework_pb2.ProgramDesc() + self.weight_list_ = [] + self.deepwise_weight_list_ = [] + # print(json_dick) + # layers = (json_dick['layer']) + # for layer in layers: + # print(layer) + + def convert(self): + print 'convert begin.....' + # add block_desc + block_desc = self.program_desc.blocks.add() + block_desc.idx = 0 + block_desc.parent_idx = -1 + self.package_ops(block_desc) + self.package_vars(block_desc) + print 'blocks: ' + print self.program_desc.blocks + print 'convert end.....' + desc_serialize_to_string = self.program_desc.SerializeToString() + + outputmodel_dir = self.base_dir + 'datas/target/mobilenet_classfication/' + if os.path.exists(outputmodel_dir): + shutil.rmtree(outputmodel_dir) + os.makedirs(outputmodel_dir, 0777) + + if os.path.exists(outputmodel_dir): + shutil.rmtree(outputmodel_dir) + # create_if_not_exit(outputmodel_dir) + + shutil.copytree(self.target_weight_dir, outputmodel_dir) + + f = open(outputmodel_dir + "__model__", "wb") + f.write(desc_serialize_to_string) + f.close() + + def package_ops(self, block_desc): + + self.add_op_feed(block_desc) + + # add ops with layer + if 'layer' in self.mdl_json: + + layers_ = self.mdl_json['layer'] + for layer in layers_: + + if layer['type'] == 'SoftmaxLayer': + pass + else: + desc_ops_add = block_desc.ops.add() + + # print layer + # for i in layer: + # print i + if 'name' in layer: + l_name = layer['name'] + if 'type' in layer: + self.package_ops_type(desc_ops_add, layer) + + if 'weight' in layer: + self.package_ops_weight2inputs(desc_ops_add, layer) + + if 'output' in layer: + self.package_ops_outputs(desc_ops_add, layer) + + if 'input' in layer: + self.package_ops_inputs(desc_ops_add, layer) + + self.package_ops_attrs(desc_ops_add, layer) + + self.add_op_fetch(block_desc) + + def add_op_feed(self, block_desc): + desc_ops_add = block_desc.ops.add() + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = 'X' + 
inputs_add.arguments.append('feed') + desc_ops_add.type = 'feed' + outputs_add = desc_ops_add.outputs.add() + outputs_add.parameter = 'Out' + outputs_add.arguments.append('data') + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'col' + # boolean + attrs_add.type = 0 + attrs_add.i = 0 + + def add_op_fetch(self, block_desc): + desc_ops_add = block_desc.ops.add() + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = 'X' + # todo pick last layer --> op output + inputs_add.arguments.append('fc7') + desc_ops_add.type = 'fetch' + outputs_add = desc_ops_add.outputs.add() + outputs_add.parameter = 'Out' + outputs_add.arguments.append('fetch') + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'col' + # boolean + attrs_add.type = 0 + attrs_add.i = 0 + + @staticmethod + def package_ops_attrs(desc_ops_add, layer): + # print l_params + # print desc_ops_add.type + if desc_ops_add.type == types.op_fluid_fusion_conv_add: + Converter.pack_fusion_conv_add_attr(desc_ops_add, layer) + elif desc_ops_add.type == types.op_fluid_relu: + # fusion_conv_add : attrs + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_mkldnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 0 + elif desc_ops_add.type == types.op_fluid_pooling: + Converter.pack_pooling_attr(desc_ops_add, layer) + pass + elif desc_ops_add.type == types.op_fluid_softmax: + pass + + @staticmethod + def pack_pooling_attr(desc_ops_add, layer): + print layer + l_params = layer['param'] + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_mkldnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 0 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_cudnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 1 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'paddings' + # ints + attrs_add.type = 3 + attrs_add.ints.append(0) + attrs_add.ints.append(0) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'strides' + # ints + attrs_add.type = 3 + 
attrs_add.ints.append(1) + attrs_add.ints.append(1) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'global_pooling' + # boolean + attrs_add.type = 6 + attrs_add.b = (l_params[types.pool2d_attrs_dict.get('global_pooling')]) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'pooling_type' + # 2-->STRING + attrs_add.type = 2 + # 注意这里 avg but mdl is ave + attrs_add.s = l_params[types.pool2d_attrs_dict.get('pooling_type')] + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'ceil_mode' + # boolean + attrs_add.type = 6 + attrs_add.b = 1 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'ksize' + # ints + attrs_add.type = 3 + attrs_add.ints.append(7) + attrs_add.ints.append(7) + + # type: "pool2d" + # attrs + # { + # name: "use_mkldnn" + # type: BOOLEAN + # b: false + # } + # attrs + # { + # name: "ceil_mode" + # type: BOOLEAN + # b: true + # } + # attrs + # { + # name: "use_cudnn" + # type: BOOLEAN + # b: true + # } + # attrs + # { + # name: "paddings" + # type: INTS + # ints: 0 + # ints: 0 + # } + # attrs + # { + # name: "strides" + # type: INTS + # ints: 1 + # ints: 1 + # } + # attrs + # { + # name: "global_pooling" + # type: BOOLEAN + # b: false + # } + # attrs + # { + # name: "data_format" + # type: STRING + # s: "AnyLayout" + # } + # attrs + # { + # name: "ksize" + # type: INTS + # ints: 7 + # ints: 7 + # } + # attrs + # { + # name: "pooling_type" + # type: STRING + # s: "avg" + # } + # is_target: false + + @staticmethod + def pack_fusion_conv_add_attr(desc_ops_add, layer): + + # fusion_conv_add : attrs + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'workspace_size_MB' + # 0-->INT + attrs_add.type = 0 + attrs_add.i = 4096 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'data_format' + # 2-->STRING + attrs_add.type = 2 + attrs_add.s = 'AnyLayout' + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'use_mkldnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 0 + + attrs_add = 
desc_ops_add.attrs.add() + attrs_add.name = 'use_cudnn' + # boolean + attrs_add.type = 6 + attrs_add.b = 1 + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'dilations' + # ints + attrs_add.type = 3 + attrs_add.ints.append(1) + attrs_add.ints.append(1) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'axis' + # int + attrs_add.type = 0 + attrs_add.i = 1 + + if 'param' in layer: + l_params = layer['param'] + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'paddings' + # ints + attrs_add.type = 3 + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')]) + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('paddings')]) + + # attrs_add = desc_ops_add.attrs.add() + # attrs_add.name = 'paddings' + # # ints + # attrs_add.type = 3 + # attrs_add.ints.append(0) + # attrs_add.ints.append(0) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'strides' + # ints + attrs_add.type = 3 + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')]) + attrs_add.ints.append(l_params[types.fusion_conv_add_attrs_dict.get('strides')]) + + # attrs_add = desc_ops_add.attrs.add() + # attrs_add.name = 'strides' + # # ints + # attrs_add.type = 3 + # attrs_add.ints.append(6) + # attrs_add.ints.append(6) + + attrs_add = desc_ops_add.attrs.add() + attrs_add.name = 'groups' + # int + attrs_add.type = 0 + attrs_add.i = l_params[types.fusion_conv_add_attrs_dict.get('groups')] + # attrs_add.i = 1 + + # + # op_attrs_tupl = types.op_io_dict.get(desc_ops_add.type) \ + # .get(types.mdl_attrs_key) + # + # + # + # + # # group stride padding + # print '----------------------' + # for i, val in enumerate(op_attrs_tupl): + # attrs_add = desc_ops_add.attrs.add() + # attr_name = op_attrs_tupl[i] + # print attr_name + # attrs_add.name = attr_name + # attrs_add.type = types.fluid_attrs_type_dict.get(attr_name) + # attrs_add. 
+ # print l_params[types.fusion_conv_add_attrs_dict.get(attr_name)] + + # for p in l_params: + # attrs_add = desc_ops_add.attrs.add() + + @staticmethod + def package_ops_inputs(desc_ops_add, layer): + l_inputs = layer['input'] + for i in l_inputs: + inputs_add = desc_ops_add.inputs.add() + # print i + inputs_add.parameter = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_inputs_key) + inputs_add.arguments.append(i) + + @staticmethod + def package_ops_outputs(desc_ops_add, layer): + l_outputs = layer['output'] + for o in l_outputs: + # print o + outputs_add = desc_ops_add.outputs.add() + dict = types.op_io_dict.get(desc_ops_add.type) + # print 'desc_ops_add.type: ' + desc_ops_add.type + # print dict + outputs_add.parameter = dict.get(types.mdl_outputs_key) + outputs_add.arguments.append(o) + + def package_ops_weight2inputs(self, desc_ops_add, layer): + l_weights = layer['weight'] + for w in l_weights: + self.weight_list_.append(w) + + if layer['type'] == types.layer_mdl_deepwise_conv: + # print l_weights[0] + self.deepwise_weight_list_.append(l_weights[0]) + + op_weight_tup = types.op_io_dict.get(desc_ops_add.type).get(types.mdl_weight_key) + if op_weight_tup is not None: + # print len(op_weight_tup) + for i, val in enumerate(op_weight_tup): + # print i + # print val + inputs_add = desc_ops_add.inputs.add() + inputs_add.parameter = op_weight_tup[i] + inputs_add.arguments.append(l_weights[i]) + + # for w in l_weights: + # inputs_add = desc_ops_add.inputs.add() + # # print w + # inputs_add.parameter = op_weight_tup[0] + # inputs_add.arguments.append(w) + + @staticmethod + def package_ops_type(desc_ops_add, layer): + l_type = layer['type'] + # print l_type + # print mdl2fluid_op_layer_dict.get(l_type) + desc_ops_add.type = types.mdl2fluid_op_layer_dict.get(l_type) + + def package_vars(self, block_desc): + vars_add = block_desc.vars.add() + vars_add.name = 'feed' + vars_add.type.type = 9 # 9 is FEED_MINIBATCH + vars_add.persistable = 1 + # fetch + vars_add = 
block_desc.vars.add() + vars_add.name = 'fetch' + vars_add.type.type = 10 # 10 is fetch list + vars_add.persistable = 1 + + json_matrix_ = self.mdl_json['matrix'] + # print json_matrix_ + for j in json_matrix_: + vars_add = block_desc.vars.add() + vars_add.name = j + vars_add.type.type = 7 # 7 is lodtensor + # print j + tensor = vars_add.type.lod_tensor.tensor + tensor.data_type = 5 # 5 is FP32 + + # print json_matrix_ + + dims_of_matrix = json_matrix_.get(j) + # dims_size = len(dims_of_matrix) + # print dims_size + + # if dims_size == 4: + # tensor.dims.append(dims_of_matrix[0]) # N + # tensor.dims.append(dims_of_matrix[3]) # C + # tensor.dims.append(dims_of_matrix[1]) # H + # tensor.dims.append(dims_of_matrix[2]) # W + # else: + + # issues in mdl model filter swich n and c + if j in self.deepwise_weight_list_ and len(dims_of_matrix) == 4: + print "deep wise issue fit: " + j + tensor.dims.append(dims_of_matrix[1]) + tensor.dims.append(dims_of_matrix[0]) + tensor.dims.append(dims_of_matrix[2]) + tensor.dims.append(dims_of_matrix[3]) + print tensor.dims + else: + for dims in dims_of_matrix: + # print dims + tensor.dims.append(dims) + + if j in self.weight_list_: + vars_add.persistable = 1 + dims_size = len(dims_of_matrix) + # print dims_size + # print 'weight name : ' + j + Swichter().copy_add_head( + self.source_weights_dir + j + '.bin', + self.target_weight_dir + j + ) + + # if dims_size == 4: + # # convert weight from nhwc to nchw + # Swichter().nhwc2nchw_one_slice_add_head( + # 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin', + # 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j, + # 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp', + # dims_of_matrix[0], + # dims_of_matrix[1], + # dims_of_matrix[2], + # dims_of_matrix[3] + # ) + # else: + # Swichter().copy_add_head( + # 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin', + # 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j, + # 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp' + # ) 
+ else: + vars_add.persistable = 0 + + +mdl_path = "datas/sourcemodels/source_profile/mobileNetModel.json" +base_dir = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/" +converter = Converter(base_dir, mdl_path) +converter.convert() diff --git a/tools/python/modeltools/mobilenet/swicher.py b/tools/python/modeltools/mobilenet/swicher.py new file mode 100644 index 0000000000000000000000000000000000000000..90bc6d26f600624b14c5912cddfe6e156865d196 --- /dev/null +++ b/tools/python/modeltools/mobilenet/swicher.py @@ -0,0 +1,119 @@ +import os +import shutil +from array import array + + +class Swichter: + def __init__(self): + pass + + def nhwc2nchw_one_slice(self, from_file_name, to_file_name, batch, channel, height, width): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + + float_array = array("f") + float_array.fromfile(from_file, width * height * batch * channel) + float_write_array = array("f") + + for b in range(batch): + for c in range(channel): + for h in range(height): + for w in range(width): + float_value = float_array[b * channel * width * height + + channel * (h * width + w) + c] + + float_write_array.append(float_value) + + float_write_array.tofile(to_file) + from_file.close() + to_file.close() + + def copy(self, from_file_name, to_file_name): + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + + to_file.write(from_file.read()) + from_file.close() + to_file.close() + + def nhwc2nchw_one_slice_add_head(self, from_file_name, to_file_name, tmp_file_name, batch, channel, height, width): + from_file = open(from_file_name, "rb") + tmp_file = open(tmp_file_name, "wb+") + float_array = array("f") + float_array.fromfile(from_file, width * height * batch * channel) + float_write_array = array("f") + + for b in range(batch): + for c in range(channel): + for h in range(height): + for w in range(width): + float_value = float_array[b * channel * width * height + + channel * (h * 
width + w) + c] + + float_write_array.append(float_value) + + float_write_array.tofile(tmp_file) + tmp_file.close() + from_file.close() + + tmp_file = open(tmp_file_name, "rb") + to_file = open(to_file_name, "wb") + + tmp = tmp_file.read() + head = self.read_head('yolo/datas/yolo/head') + to_file.write(head) + to_file.write(tmp) + tmp_file.close() + to_file.close() + + def read_head(self, head_file): + from_file = open(head_file, "rb") + read = from_file.read(24) + # print read + from_file.close() + # print read + return read + + def copy_add_head(self, from_file_name, to_file_name): + + from_file = open(from_file_name, "rb") + to_file = open(to_file_name, "wb") + # tmp_file = open(tmp_file_name, "wb") + + head = self.read_head( + '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/head/head') + to_file.write(head) + to_file.write(from_file.read()) + from_file.close() + to_file.close() + pass + + def copy_padding_add_head(self, from_file_name, to_file_name, tmp_file_name, padding): + print'padding = %d' % padding + from_file = open(from_file_name, "rb") + # print len(from_file.read()) + from_file.seek(padding, 0) + + read = from_file.read() + print len(read) + + to_file = open(to_file_name, "wb") + # tmp_file = open(tmp_file_name, "wb") + + head = self.read_head('yolo/datas/yolo/head') + to_file.write(head) + to_file.write(read) + from_file.close() + to_file.close() + pass + +# Swichter().nhwc2nchw_one_slice_add_head( +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp', +# 32, +# 3, 3, 3) + +# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/head') + +# 
Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '') diff --git a/tools/python/modeltools/tools/__init__.py b/tools/python/modeltools/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/python/tools/mdl2fluid/float2halffloat.py b/tools/python/modeltools/tools/float2halffloat.py similarity index 100% rename from python/tools/mdl2fluid/float2halffloat.py rename to tools/python/modeltools/tools/float2halffloat.py diff --git a/python/tools/mdl2fluid/loader.py b/tools/python/modeltools/tools/loader.py similarity index 73% rename from python/tools/mdl2fluid/loader.py rename to tools/python/modeltools/tools/loader.py index ef2258e365a84003b7b90ac480abbd9798f48f59..cb996c8bedd78004e667f1433bfdb20785e7792f 100644 --- a/python/tools/mdl2fluid/loader.py +++ b/tools/python/modeltools/tools/loader.py @@ -1,9 +1,4 @@ -import datetime import json -import os - -import google.protobuf as pbg -import framework_pb2 as framework_pb2 def loadmdl(json_path): diff --git a/python/tools/mdl2fluid/model_combine.py b/tools/python/modeltools/tools/model_combine.py similarity index 100% rename from python/tools/mdl2fluid/model_combine.py rename to tools/python/modeltools/tools/model_combine.py diff --git a/python/tools/mdl2fluid/model_reader.py b/tools/python/modeltools/tools/model_reader.py similarity index 71% rename from python/tools/mdl2fluid/model_reader.py rename to tools/python/modeltools/tools/model_reader.py index 8d53350db20739526b77663f791942299d4bc149..5f6e5f0cb9da8fb349e35211ed56f77bb9cf95da 100644 --- a/python/tools/mdl2fluid/model_reader.py +++ b/tools/python/modeltools/tools/model_reader.py @@ -1,6 +1,6 @@ import os -import framework_pb2 as framework_pb2 +from core import framework_pb2 as framework_pb2 def read_model(model_path): @@ -16,7 +16,7 @@ def read_model(model_path): # print desc.blocks except IOError: - print ": File not found. Creating a new file." 
+ print ": File not found." def get_file_size(file_path): @@ -26,5 +26,5 @@ def get_file_size(file_path): return round(fsize, 2) -path = "newyolo/__model__" +path = '/Users/xiebaiyuan/PaddleProject/paddle-mobile/tools/python/modeltools/mobilenet/datas/sourcemodels/mobilenet_example/mobilenet/__model__' read_model(path) diff --git a/tools/python/modeltools/yolo/__init__.py b/tools/python/modeltools/yolo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/python/tools/mdl2fluid/mdl2fluid.py b/tools/python/modeltools/yolo/mdl2fluid.py similarity index 89% rename from python/tools/mdl2fluid/mdl2fluid.py rename to tools/python/modeltools/yolo/mdl2fluid.py index a57a01d09eaf236fd9f890dcb9e8eead19aa7868..2c2d0f3e9498254f26da6ff1b88b8a33e1b31d27 100644 --- a/python/tools/mdl2fluid/mdl2fluid.py +++ b/tools/python/modeltools/yolo/mdl2fluid.py @@ -1,9 +1,7 @@ import json -import os -import framework_pb2 as framework_pb2 -import op_types as types -from swicher import Swichter +from core import framework_pb2 as framework_pb2, op_types as types +from yolo.swicher import Swichter import shutil @@ -40,10 +38,10 @@ class Converter: print self.program_desc.blocks print 'convert end.....' 
desc_serialize_to_string = self.program_desc.SerializeToString() - shutil.rmtree('newyolo/') - shutil.copytree('multiobjects/float32s_nchw_with_head', 'newyolo/') + shutil.rmtree('yolo/datas/newyolo/') + shutil.copytree('yolo/datas/multiobjects/float32s_nchw_with_head/', 'yolo/datas/newyolo/') - f = open("newyolo/__model__", "wb") + f = open("yolo/datas/newyolo/__model__", "wb") f.write(desc_serialize_to_string) f.close() @@ -312,9 +310,9 @@ class Converter: if dims_size == 4: # convert weight from nhwc to nchw Swichter().nhwc2nchw_one_slice_add_head( - '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin', - '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j, - '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp', + 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin', + 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j, + 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp', dims_of_matrix[0], dims_of_matrix[1], dims_of_matrix[2], @@ -322,14 +320,14 @@ class Converter: ) else: Swichter().copy_add_head( - '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/' + j + '.bin', - '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/' + j, - '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/' + j + '.tmp' + 'yolo/datas/multiobjects/float32s_nhwc/' + j + '.bin', + 'yolo/datas/multiobjects/float32s_nchw_with_head/' + j, + 'yolo/datas/multiobjects/float32s_nchw/' + j + '.tmp' ) else: vars_add.persistable = 0 -mdl_path = "/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/YOLO_Universal.json" +mdl_path = "yolo/datas/multiobjects/YOLO_Universal.json" converter = Converter(mdl_path) converter.convert() diff --git 
a/python/tools/mdl2fluid/swicher.py b/tools/python/modeltools/yolo/swicher.py similarity index 86% rename from python/tools/mdl2fluid/swicher.py rename to tools/python/modeltools/yolo/swicher.py index bfe0360fd5b32f5e6fa61f6f05a0a384fb3a1e9b..713ce93985957fe7f3c99d6bc6a9c436faea59a4 100644 --- a/python/tools/mdl2fluid/swicher.py +++ b/tools/python/modeltools/yolo/swicher.py @@ -58,7 +58,7 @@ class Swichter: to_file = open(to_file_name, "wb") tmp = tmp_file.read() - head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + head = self.read_head('yolo/datas/yolo/head') to_file.write(head) to_file.write(tmp) tmp_file.close() @@ -77,7 +77,7 @@ class Swichter: to_file = open(to_file_name, "wb") # tmp_file = open(tmp_file_name, "wb") - head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + head = self.read_head('yolo/datas/yolo/head') to_file.write(head) to_file.write(from_file.read()) from_file.close() @@ -96,7 +96,7 @@ class Swichter: to_file = open(to_file_name, "wb") # tmp_file = open(tmp_file_name, "wb") - head = self.read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') + head = self.read_head('yolo/datas/yolo/head') to_file.write(head) to_file.write(read) from_file.close() @@ -104,12 +104,12 @@ class Swichter: pass # Swichter().nhwc2nchw_one_slice_add_head( -# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nhwc/conv1_0.bin', -# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw_with_head/conv1_0', -# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/multiobjects/float32s_nchw/.tmp', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nhwc/conv1_0.bin', +# 
'/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw_with_head/conv1_0', +# '/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/multiobjects/float32s_nchw/.tmp', # 32, # 3, 3, 3) -# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/mdl2fluid/yolo/conv1_biases') +# Swichter().read_head('/Users/xiebaiyuan/PaddleProject/paddle-mobile/python/tools/modeltools/yolo/head') # Swichter().copy_add_head('datas/model.0.0.weight', 'datas/conv1_0', '')