diff --git a/README.md b/README.md index 1a478db3770e1f5e518594fd2fefabb686cf3c38..ee4e20513186979fe76c1259e7fc3ca962426843 100644 --- a/README.md +++ b/README.md @@ -26,61 +26,10 @@ Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平 - **ARM CPU** -|mobilenet arm v7|1线程|2线程|4线程| -|------------|----|-----|-----| -|麒麟970(ms)|108.180|63.935|37.545| -|麒麟960(ms)|108.588|63.073|36.822| -|高通845(ms)|85.952|48.890|28.641| -|高通835(ms)|105.434|62.752|37.131| -||||| -|mobilenetssd arm v7|1线程|2线程|4线程| -|麒麟970(ms)|212.686|127.205|77.485| -|麒麟960(ms)|212.641|125.338|75.250| -|高通845(ms)|182.863|95.671|56.857| -|高通835(ms)|213.849|127.717|77.006| -||||| -|googlenet(v1) arm v7|1线程|2线程|4线程| -|麒麟970(ms)|335.288|234.559|161.295| -|麒麟960(ms)|354.443|232.642|157.815| -|高通845(ms)|282.007|173.146|122.148| -|高通835(ms)|341.250|233.354|158.554| -||||| -|squeezenet arm v7|1线程|2线程|4线程| -|麒麟970(ms)|83.726|57.944|36.923| -|麒麟960(ms)|85.835|55.762|36.496| -|高通845(ms)|71.301|41.618|28.785| -|高通835(ms)|82.407|56.176|36.455| -||||| -|yolo arm v7|1线程|2线程|4线程| -|麒麟970(ms)|129.658|79.993|49.969| -|麒麟960(ms)|130.208|78.791|48.390| -|高通845(ms)|109.244|61.736|40.600| -|高通835(ms)|130.402|80.863|50.359| - - 测试机型信息: - 麒麟970:荣耀v10 (2.36GHz * 4 + 1.8GHz * 4) - 麒麟960:华为mate9 (2.36GHz * 4 + 1.8GHz * 4) - 骁龙835:小米6 (2.45GHz * 4 + 1.9GHz * 4) - 骁龙845:OPPO FindX (2.80GHz * 4 + 1.8GHz * 4) - - **Mali GPU** - Mali GPU是百度和ARM合作开发的,双方团队近期都在致力于将paddle的op能无缝运行在ACL(arm compute library)。目前已经支持squeezenet,googlenet,resnet等几个网络模型,后续会继续加大力度。使全部移动端paddle op能高效运行在mali gpu上。 - - **苹果设备的GPU Metal实现** -|mobilenetfssd|速度| -|------------|-----| -|A9(ms)|33.78| -|A10(ms)|24.05| -|A11(ms)|17.15| -||| -|genet|速度| -|A9(ms) |3.49| -|A10(ms)|2.54| -|A11(ms)|1.43| - - - **FPGA** 目前已经支持 ZCU102 开发板。 diff --git a/doc/development_fpga.md b/doc/development_fpga.md index 14cc57c6b4055e8c4e45d8b673eb1e3be22ae256..3389ddde676a5d1c7b452dc734880eb50170bd3e 100644 --- a/doc/development_fpga.md +++ b/doc/development_fpga.md @@ -27,8 +27,9 @@ ___ ## 准备模型和数据 ___ 1. 模型文件放在./test/models/resnet50中。将[\_\_model\_\_](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)文件复制到此文件夹下。 -2. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。 -3. 将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到/test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse". +2. 如果不存在,则创建文件夹./test/models/resnet50 和 ./test/images。 +3. 另外下载模型[权重文件](http://paddle-imagenet-models.bj.bcebos.com/resnet_50_model.tar),解压后也放在./test/models/resnet50 中。 +4. 
将数据文件[image_src_float](http://mms-graph.bj.bcebos.com/paddle-mobile/fpga/files.tar.gz)复制到./test/images下。此数据文件对应着标准数据集中的ILSVRC2012_val_00000885.JPEG,分类标签为80, 对应着"black grouse"。 ## 运行程序 ___ diff --git a/doc/development_ios.md b/doc/development_ios.md index 1d4f28bd5bcde1c3068ddeae87627ae6686d886a..1dbc7555e8ed6db94071c571673212d0ce2b7a71 100644 --- a/doc/development_ios.md +++ b/doc/development_ios.md @@ -34,7 +34,7 @@ cd ../build/release/ios/build libpaddle-mobile.a /src/ios_io/ 下的 -PaddleMobile.h +PaddleMobileCPU.h ``` 拖入工程 diff --git a/src/common/types.cpp b/src/common/types.cpp index 8c8de7765161e61dc75036a87a34fc6abd2df43e..b90fb70f2a81b365f049632cc7281a69ec58e18d 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -40,9 +40,11 @@ const char *G_OP_TYPE_POOL2D = "pool2d"; const char *G_OP_TYPE_PRIOR_BOX = "prior_box"; const char *G_OP_TYPE_RELU = "relu"; const char *G_OP_TYPE_RESHAPE = "reshape"; +const char *G_OP_TYPE_RESHAPE2 = "reshape2"; const char *G_OP_TYPE_SIGMOID = "sigmoid"; const char *G_OP_TYPE_SOFTMAX = "softmax"; const char *G_OP_TYPE_TRANSPOSE = "transpose"; +const char *G_OP_TYPE_TRANSPOSE2 = "transpose2"; const char *G_OP_TYPE_SPLIT = "split"; const char *G_OP_TYPE_FEED = "feed"; const char *G_OP_TYPE_FETCH = "fetch"; @@ -90,6 +92,7 @@ std::unordered_map< {G_OP_TYPE_FEED, {{"X"}, {"Out"}}}, {G_OP_TYPE_FETCH, {{"X"}, {"Out"}}}, {G_OP_TYPE_TRANSPOSE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_TRANSPOSE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_BOX_CODER, {{"PriorBox", "PriorBoxVar", "TargetBox"}, {"OutputBox"}}}, {G_OP_TYPE_FUSION_CONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, @@ -99,6 +102,7 @@ std::unordered_map< {G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}}, {G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}}, + {G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}}, {G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}}, {G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}}, {G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}}, diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 2b76b0158fe06e8678208f6f98fcdb71f8d91e51..982f1c0f3525afde8475866c0121343fafc9d5a0 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -109,9 +109,15 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu); #ifdef RESHAPE_OP LOAD_OP2(reshape, CPU, MALI_GPU); #endif +#ifdef RESHAPE2_OP +LOAD_OP2(reshape2, CPU, MALI_GPU); +#endif #ifdef TRANSPOSE_OP LOAD_OP1(transpose, CPU); #endif +#ifdef TRANSPOSE2_OP +LOAD_OP1(transpose2, CPU); +#endif #ifdef PRIORBOX_OP LOAD_OP1(prior_box, CPU); #endif @@ -221,5 +227,9 @@ LOAD_FUSION_MATCHER(fusion_conv_bn); #ifdef ELEMENTWISESUB_OP LOAD_OP1(elementwise_sub, CPU) #endif +#ifdef QUANT_OP LOAD_OP1(quantize, CPU); +#endif +#ifdef DEQUANT_OP LOAD_OP1(dequantize, CPU); +#endif diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index e7552d2602b31f9a5c10e3d81122babae8fcf1a8..11a1f0a53d4886e1a07d258b76b3827671471dca 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -135,11 +135,15 @@ static void quantize_round_to_even(const Tensor *input, const float scale, #if defined(__ARM_NEON__) || defined(__ARM_NEON) size_t loop = size >> 4; size_t remain = size & 0xF; + + #pragma omp parallel for for (size_t i = 0; i < loop; ++i) { - float32x4_t r0 = vld1q_f32(x); - float32x4_t r1 = vld1q_f32(x + 4); - float32x4_t r2 = vld1q_f32(x + 8); - float32x4_t r3 = vld1q_f32(x + 12); + const float *local_x = x + (i << 4); + 
int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -156,12 +160,12 @@ static void quantize_round_to_even(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     float value = x[i] * scale;
@@ -187,11 +191,15 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -208,12 +216,12 @@ static void quantize_round_to_zero(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = trunc(x[i] * scale);
@@ -228,11 +236,15 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
   size_t loop = size >> 4;
   size_t remain = size & 0xF;
+
+  #pragma omp parallel for
   for (size_t i = 0; i < loop; ++i) {
-    float32x4_t r0 = vld1q_f32(x);
-    float32x4_t r1 = vld1q_f32(x + 4);
-    float32x4_t r2 = vld1q_f32(x + 8);
-    float32x4_t r3 = vld1q_f32(x + 12);
+    const float *local_x = x + (i << 4);
+    int8_t *local_y = y + (i << 4);
+    float32x4_t r0 = vld1q_f32(local_x);
+    float32x4_t r1 = vld1q_f32(local_x + 4);
+    float32x4_t r2 = vld1q_f32(local_x + 8);
+    float32x4_t r3 = vld1q_f32(local_x + 12);
     r0 = vmulq_n_f32(r0, scale);
     r1 = vmulq_n_f32(r1, scale);
     r2 = vmulq_n_f32(r2, scale);
@@ -249,12 +261,12 @@ static void quantize_round_to_nearest(const Tensor *input, const float scale,
     int16x8_t q6 = vcombine_s16(d2, d3);
     int8x8_t d5 = vmovn_s16(q5);
     int8x8_t d6 = vmovn_s16(q6);
-    vst1_s8(y, d5);
-    vst1_s8(y + 8, d6);
-    x += 16;
-    y += 16;
+    vst1_s8(local_y, d5);
+    vst1_s8(local_y + 8, d6);
   }
   size = remain;
+  x += (loop << 4);
+  y += (loop << 4);
 #endif
   for (size_t i = 0; i < size; ++i) {
     y[i] = round(x[i] * scale);
diff --git a/src/operators/kernel/arm/reshape2_kernel.cpp b/src/operators/kernel/arm/reshape2_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83bbf112abb8b5e290126d6909a0fe77291f8fac
--- /dev/null
+++ b/src/operators/kernel/arm/reshape2_kernel.cpp
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#include "operators/kernel/reshape2_kernel.h"
+#include "operators/kernel/central-arm-func/reshape2_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool Reshape2Kernel<CPU, float>::Init(Reshape2Param<CPU> *param) {
+  return true;
+}
+
+template <>
+void Reshape2Kernel<CPU, float>::Compute(
+    const Reshape2Param<CPU> &param) const {
+  Reshape2Compute<float>(param);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/arm/transpose2_kernel.cpp b/src/operators/kernel/arm/transpose2_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..656d2768840a52f50c42d3797018aa9aec037783
--- /dev/null
+++ b/src/operators/kernel/arm/transpose2_kernel.cpp
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef TRANSPOSE2_OP
+
+#include "operators/kernel/transpose2_kernel.h"
+#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool Transpose2Kernel<CPU, float>::Init(Transpose2Param<CPU> *param) {
+  return true;
+}
+
+template <>
+void Transpose2Kernel<CPU, float>::Compute(
+    const Transpose2Param<CPU> &param) const {
+  Transpose2Compute<float>(param);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index f80a8f944139566483c47daf10f9decac49650dc..e7a8c7f52db327f3ff5871566c3557c484ba4d13 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -16,7 +16,6 @@ limitations under the License. */
 #pragma once
 #include <vector>
-#include "operators/math/conv_arm_int8.h"
 #include "operators/math/conv_func.h"
 #include "operators/math/depthwise_conv_3x3.h"
 #include "operators/math/im2col.h"
@@ -28,11 +27,12 @@ limitations under the License.
*/ namespace paddle_mobile { namespace operators { -template +template inline void ConvBasic(const ConvParam ¶m) { const Tensor *input = param.Input(); Tensor filter = *param.Filter(); Tensor *output = param.Output(); + output->mutable_data(); int groups = param.Groups(); const std::vector strides = param.Strides(); const std::vector paddings = param.Paddings(); @@ -60,7 +60,7 @@ inline void ConvBasic(const ConvParam ¶m) { Tensor col; Tensor col_matrix; if (is_expand) { - col.mutable_data(col_shape); + col.mutable_data(col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -79,8 +79,8 @@ inline void ConvBasic(const ConvParam ¶m) { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); @@ -109,69 +109,18 @@ inline void ConvBasic(const ConvParam ¶m) { Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(filter_slice, false, col_matrix, false, + math::matmul(filter_slice, false, col_matrix, false, static_cast(1), &out_slice, static_cast(0)); } } } -inline void ConvCompute_int8(const ConvParam ¶m) { - typedef void (*ConvFunc)(const Tensor &input, const Tensor &kernel, - Tensor *output); - static ConvFunc conv_funcs_table[7][5] = { - {0, 0, 0, 0, 0}, // k = 1 - {0, 0, 0, 0, 0}, {conv3x3s1_int8, 0, 0, 0, 0}, // k = 3 - {0, 0, 0, 0, 0}, {conv5x5s1_int8, 0, 0, 0, 0}, // k = 5 - {0, 0, 0, 0, 0}, {0, 0, 0, 0, 0}, // k = 7 - }; - const Tensor *input = param.Input(); - Tensor *filter = param.Filter(); - Tensor *output = param.Output(); - int groups = param.Groups(); - const std::vector &strides = param.Strides(); - const std::vector &paddings = param.Paddings(); - const std::vector &dilations = param.Dilations(); - int kernel_h = filter->dims()[2]; - int kernel_w = filter->dims()[3]; - output->mutable_data(); - - ConvFunc conv_func = 0; - if (strides[1] == strides[0] && strides[1] < 6 && kernel_h == kernel_w && - kernel_h < 8 && groups == 1 && dilations[0] == dilations[1] && - dilations[1] == 1) { - conv_func = conv_funcs_table[kernel_h - 1][strides[0] - 1]; - } - if (conv_func) { - int batch_size = input->dims()[0]; - math::PadFunctor pad; - - Tensor input_pad; - for (int i = 0; i < batch_size; ++i) { - Tensor in_batch = input->Slice(i, i + 1); - Tensor out_batch = output->Slice(i, i + 1); - if (paddings[0] == 0 && paddings[1] == 0) { - input_pad = in_batch; - } else { - framework::DDim pad_shape = in_batch.dims(); - pad_shape[2] += 2 * paddings[0]; - pad_shape[3] += 2 * paddings[1]; - input_pad.mutable_data(pad_shape); - pad(in_batch, paddings[0], paddings[1], &input_pad); - } - conv_func(input_pad, *filter, &out_batch); - } - } else { - ConvBasic(param); - } -} - template void ConvCompute(const ConvParam ¶m) { if (param.Input()->type() == typeid(int8_t)) { - ConvCompute_int8(param); + ConvBasic(param); } else { - param.Output()->mutable_data(); if (param.Groups() == param.Input()->dims()[1] && param.Input()->dims()[1] == param.Output()->dims()[1] && param.Filter()->dims()[2] == param.Filter()->dims()[3] && @@ -185,7 +134,7 @@ void ConvCompute(const ConvParam ¶m) { math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), param.Filter(), nullptr, param.Output(), false); } else { - 
ConvBasic(param); + ConvBasic(param); } } } diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h index ff5d5d4b2a351d075fcecce209063aa66e026754..73170bdab922a46831334307aebc8af210ddfb73 100644 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h @@ -44,7 +44,7 @@ void DepthwiseConvCompute(const ConvParam ¶m) { Bias, false); } else { - ConvBasic(param); + ConvBasic(param); } } diff --git a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h index 0c01ef0072444479d2d2e2f7676b842d89e432ec..b6288380a04c71b3d6467f7f6648db046ae9acc9 100644 --- a/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/elementwise_add_arm_func.h @@ -58,6 +58,7 @@ void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { const float *input_data = input_x->data(); float *output_data = Out->mutable_data(); for (int i = 0; i < batch; ++i) { + #pragma omp parallel for for (int j = 0; j < channels; ++j) { size_t offset = (i * channels + j) * elementwise_num; const float *input = input_data + offset; diff --git a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h index 6db4297046fba8cbb8028f1c70d8214b703158b6..9cbac1035faf4cdc5109a08ea78dfafa8e1df7f2 100644 --- a/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h +++ b/src/operators/kernel/central-arm-func/polygon_box_transform_arm_func.h @@ -26,7 +26,7 @@ void PolygonBoxTransformCompute(const PolygonBoxTransformParam& param) { const auto& input_dims = input->dims(); const auto* input_data = input->data(); auto* output = param.Output(); - auto* output_data = output->mutable_data(); + auto* output_data = output->mutable_data(input_dims); int64_t batch_size = input_dims[0]; int64_t geo_channel = input_dims[1]; diff --git a/src/operators/kernel/central-arm-func/reshape2_arm_func.h b/src/operators/kernel/central-arm-func/reshape2_arm_func.h new file mode 100644 index 0000000000000000000000000000000000000000..c22cf120313b039944932fb4e6cc52aa59a68fd4 --- /dev/null +++ b/src/operators/kernel/central-arm-func/reshape2_arm_func.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#ifdef RESHAPE2_OP
+#pragma once
+
+#include <vector>
+#include "operators/kernel/reshape_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void Reshape2Compute(const Reshape2Param<CPU> &param) {
+  const auto *input_x = param.InputX();
+  const auto &input_x_dims = input_x->dims();
+  auto *out = param.Out();
+  framework::DDim out_dims = out->dims();
+  const auto *input_shape = param.InputShape();
+
+  if (input_shape) {
+    auto *shape_data = input_shape->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    auto shape =
+        std::vector<int>(shape_data, shape_data + input_shape->numel());
+    out_dims = ValidateShape(shape, input_x->dims());
+  } else {
+    auto &shape = param.Shape();
+    out_dims = ValidateShape(shape, input_x_dims);
+  }
+
+  bool inplace = param.Inplace();
+  out->Resize(out_dims);
+  if (!inplace) {
+    out->mutable_data<float>();
+    framework::TensorCopy(*input_x, out);
+    out->Resize(out_dims);
+  } else {
+    out->ShareDataWith(*input_x);
+    out->Resize(out_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/central-arm-func/transpose2_arm_func.h b/src/operators/kernel/central-arm-func/transpose2_arm_func.h
new file mode 100644
index 0000000000000000000000000000000000000000..dea90e863b20f19820d60d9cce67b6849d3c467b
--- /dev/null
+++ b/src/operators/kernel/central-arm-func/transpose2_arm_func.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#ifdef TRANSPOSE2_OP
+#pragma once
+
+#include <vector>
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename P>
+void Transpose2Compute(const Transpose2Param<CPU>& param) {
+  const auto* input_x = param.InputX();
+  const auto input_x_dims = input_x->dims();
+  auto* out = param.Out();
+  const auto axis = param.Axis();
+  const auto* input_x_data = input_x->data<float>();
+  auto* out_data = out->mutable_data<float>();
+
+  size_t ndim = axis.size();
+  std::vector<int> xdim(ndim);
+  std::vector<int> xstride(ndim);
+  std::vector<int> xout(ndim);
+  for (int i = 0; i < ndim; i++) {
+    int j = ndim - 1 - i;
+    xdim[j] = input_x_dims[axis[i]];
+    xstride[j] = 1;
+    for (int k = axis[i] + 1; k < ndim; k++) {
+      xstride[j] *= input_x_dims[k];
+    }
+    xout[j] = xstride[j] * xdim[j];
+  }
+
+  auto numel = input_x->numel();
+  size_t pind = 0;
+  std::vector<int> ind(ndim);
+  for (int i = 0; i < numel; i++) {
+    out_data[i] = input_x_data[pind];
+    ind[0]++;
+    pind += xstride[0];
+    for (int j = 0; j < ndim - 1; j++) {
+      if (ind[j] == xdim[j]) {
+        ind[j + 1]++;
+        ind[j] = 0;
+        pind += xstride[j + 1];
+        pind -= xout[j];
+      } else {
+        break;
+      }
+    }
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/reshape2_kernel.h b/src/operators/kernel/reshape2_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d15a619d314e3f5d3085a34cff503e286b5ee37
--- /dev/null
+++ b/src/operators/kernel/reshape2_kernel.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#pragma once
+
+#include <vector>
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <typename DeviceType, typename T>
+class Reshape2Kernel
+    : public framework::OpKernelBase<DeviceType, Reshape2Param<DeviceType>> {
+ public:
+  void Compute(const Reshape2Param<DeviceType>& param) const;
+  bool Init(Reshape2Param<DeviceType>* param);
+};
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/math/conv_arm_int8.h b/src/operators/kernel/transpose2_kernel.h
similarity index 60%
rename from src/operators/math/conv_arm_int8.h
rename to src/operators/kernel/transpose2_kernel.h
index 98843e6158bb0f9816bf49a1cbced5a2ea731446..8ae75ea483ddb887d9c53b32228ff72b41c76097
--- a/src/operators/math/conv_arm_int8.h
+++ b/src/operators/kernel/transpose2_kernel.h
@@ -12,25 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#ifdef CONV_OP +#ifdef TRANSPOSE2_OP #pragma once -#include "framework/tensor.h" +#include + +#include "framework/operator.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { -void conv3x3s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, framework::Tensor* output); - -void conv3x3s1_int8_4c(const framework::Tensor& input, - const framework::Tensor& weight, - framework::Tensor* output); - -void conv5x5s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, framework::Tensor* output); - +template +class Transpose2Kernel + : public framework::OpKernelBase> { + public: + void Compute(const Transpose2Param& param) const; + bool Init(Transpose2Param* param); +}; } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/conv3x3_arm_int8.cpp b/src/operators/math/conv3x3_arm_int8.cpp deleted file mode 100644 index 283dcb2255b43052dcaf2d622ad629e923810a82..0000000000000000000000000000000000000000 --- a/src/operators/math/conv3x3_arm_int8.cpp +++ /dev/null @@ -1,761 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#include "operators/math/conv_arm_int8.h" - -namespace paddle_mobile { -namespace operators { - -void conv3x3s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, - framework::Tensor* output) { -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - const int8_t* in_data = input.data(); - const int8_t* w_data = weight.data(); - int32_t* out_data = output->mutable_data(); - // make sure that batch size is 1 - int input_c = input.dims()[1]; - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_c = output->dims()[1]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); -#if __aarch64__ - // TODO(hjchen2) -#else - int oc = 0; - #pragma omp parallel for - for (; oc < output_c - 1; oc += 2) { - for (int ic = 0; ic < input_c; ++ic) { - const int8_t* kernel0 = w_data + (oc * input_c + ic) * 9; - const int8_t* kernel1 = w_data + ((oc + 1) * input_c + ic) * 9; - int32_t* output0 = out_data + oc * out_image_size; - int32_t* output0n = output0 + output_w; - int32_t* output1 = out_data + (oc + 1) * out_image_size; - int32_t* output1n = output1 + output_w; - - int oh = 0; - for (; oh < output_h - 1; oh += 2) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - 
"vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vdup.s8 d9, d1[0] \n" - "vdup.s8 d10, d1[1] \n" - "vdup.s8 d11, d1[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddl.s16 q14, d12, d14 \n" - "vaddl.s16 q15, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q8, d12, d14 \n" - "vaddl.s16 q9, d13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddl.s16 q10, d12, d14 \n" - "vaddl.s16 q11, d13, d15 \n" - - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vdup.s8 d9, d1[3] \n" - "vdup.s8 d10, d1[4] \n" - "vdup.s8 d11, d1[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q8, q8, d12 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - "vaddw.s16 q9, q9, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q10, q10, d12 \n" - "vaddw.s16 q11, q11, d13 \n" - "vaddw.s16 q10, q10, d14 \n" - "vaddw.s16 q11, q11, d15 \n" - - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vdup.s8 d9, d1[6] \n" - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, r6 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.32 {d12-d15}, [%[output1]] \n" - "vadd.s32 q6, q6, q14 \n" - "vadd.s32 q7, q7, q15 \n" - "vst1.32 {d12-d15}, [%[output1]]! \n" - - "vld1.8 {d2-d3}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q8, q8, d12 \n" - "vaddw.s16 q9, q9, d15 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - - "vld1.32 {d12-d15}, [%[output0n]] \n" - "vadd.s32 q6, q6, q8 \n" - "vadd.s32 q7, q7, q9 \n" - "vst1.32 {d12-d15}, [%[output0n]]! 
\n" - - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q10, q10, d12 \n" - "vaddw.s16 q11, q11, d15 \n" - "vaddw.s16 q10, q10, d14 \n" - "vaddw.s16 q11, q11, d13 \n" - - "vld1.32 {d12-d15}, [%[output1n]] \n" - "vadd.s32 q6, q6, q10 \n" - "vadd.s32 q7, q7, q11 \n" - "vst1.32 {d12-d15}, [%[output1n]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [ow] "+r"(ow), [output0] "+r"(output0), [output1] "+r"(output1), - [output0n] "+r"(output0n), [output1n] "+r"(output1n) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5", - "r6"); - } - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "vdup.s8 d2, r5 \n" - "vdup.s8 d3, r6 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - "vext.8 d10, d1, d3, #3 \n" - "vext.8 d11, d1, d3, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - "vmull.s8 q6, d4, d1 \n" - "vmull.s8 q7, d5, d10 \n" - "vmlal.s8 q6, d6, d11 \n" - "vaddl.s16 q13, d12, d14 \n" - "vdup.s32 d2, d26[1] \n" - "vadd.s32 d26, d26, d2 \n" - "vadd.s32 d26, d26, d27 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! \n" - "ldr r7, [%[output1]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d26 \n" - "vst1.32 d14[0], [%[output1]]! \n" - - "vmull.s8 q6, d5, d0 \n" - "vmull.s8 q7, d6, d8 \n" - "vmlal.s8 q6, d7, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - "vmull.s8 q6, d5, d1 \n" - "vmull.s8 q7, d6, d10 \n" - "vmlal.s8 q6, d7, d11 \n" - "vaddl.s16 q13, d12, d14 \n" - "vdup.s32 d2, d26[1] \n" - "vadd.s32 d26, d26, d2 \n" - "vadd.s32 d26, d26, d27 \n" - - "ldr r7, [%[output0n]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0n]]! \n" - "ldr r7, [%[output1n]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d26 \n" - "vst1.32 d14[0], [%[output1n]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [remain] "+r"(remain), [output0] "+r"(output0), - [output1] "+r"(output1), [output0n] "+r"(output0n), - [output1n] "+r"(output1n) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r6", "r7"); - } - output0 += output_w; - output1 += output_w; - output0n += output_w; - output1n += output_w; - } - // remain output height - for (; oh < output_h; ++oh) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - const int8_t* r4 = r3 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vdup.s8 d9, d1[0] \n" - "vdup.s8 d10, d1[1] \n" - "vdup.s8 d11, d1[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddl.s16 q14, d12, d14 \n" - "vaddl.s16 q15, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vdup.s8 d9, d1[3] \n" - "vdup.s8 d10, d1[4] \n" - "vdup.s8 d11, d1[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vdup.s8 d9, d1[6] \n" - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, r6 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - "vmull.s8 q6, d2, d9 \n" - "vmull.s8 q7, d4, d10 \n" - "vmlal.s8 q6, d5, d11 \n" - "vaddw.s16 q14, q14, d12 \n" - "vaddw.s16 q14, q14, d14 \n" - "vaddw.s16 q15, q15, d13 \n" - "vaddw.s16 q15, q15, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - "vld1.32 {d12-d15}, [%[output1]] \n" - "vadd.s32 q6, q6, q14 \n" - "vadd.s32 q7, q7, q15 \n" - "vst1.32 {d12-d15}, [%[output1]]! 
\n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [ow] "+r"(ow), - [output0] "+r"(output0), [output1] "+r"(output1) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5", - "r6"); - } - - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - "vld1.8 {d1}, [%[kernel1]] \n" - "ldr r6, [%[kernel1], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "vdup.s8 d2, r5 \n" - "vdup.s8 d3, r6 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - "vext.8 d10, d1, d3, #3 \n" - "vext.8 d11, d1, d3, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - "vmull.s8 q6, d4, d1 \n" - "vmull.s8 q7, d5, d10 \n" - "vmlal.s8 q6, d6, d11 \n" - "vaddl.s16 q13, d12, d14 \n" - "vdup.s32 d2, d26[1] \n" - "vadd.s32 d26, d26, d2 \n" - "vadd.s32 d26, d26, d27 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! \n" - "ldr r7, [%[output1]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d26 \n" - "vst1.32 d14[0], [%[output1]]! \n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [remain] "+r"(remain), [output0] "+r"(output0), - [output1] "+r"(output1) - : [kernel0] "r"(kernel0), [kernel1] "r"(kernel1) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r6", "r7"); - } - } - } - } - - for (; oc < output_c; ++oc) { - for (int ic = 0; ic < input_c; ++ic) { - const int8_t* kernel0 = w_data + (oc * input_c + ic) * 9; - int32_t* output0 = out_data + oc * out_image_size; - int32_t* output0n = output0 + output_w; - - int oh = 0; - for (; oh < output_h - 1; oh += 2) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q8, d12, d14 \n" - "vaddl.s16 q9, d13, d15 \n" - - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 
q8, q8, d12 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - "vaddw.s16 q9, q9, d15 \n" - - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - - "vld1.8 {d2-d3}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - - "vmull.s8 q6, d2, d6 \n" // next row - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q8, q8, d12 \n" - "vaddw.s16 q8, q8, d14 \n" - "vaddw.s16 q9, q9, d13 \n" - "vaddw.s16 q9, q9, d15 \n" - - "vld1.32 {d12-d15}, [%[output0n]] \n" - "vadd.s32 q6, q6, q8 \n" - "vadd.s32 q7, q7, q9 \n" - "vst1.32 {d12-d15}, [%[output0n]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [ow] "+r"(ow), [output0] "+r"(output0), - [output0n] "+r"(output0n) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5"); - } - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "vdup.s8 d2, r5 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! \n" - - "vmull.s8 q6, d5, d0 \n" - "vmull.s8 q7, d6, d8 \n" - "vmlal.s8 q6, d7, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - - "ldr r7, [%[output0n]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0n]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [remain] "+r"(remain), [output0] "+r"(output0), - [output0n] "+r"(output0n) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r7"); - } - output0 += output_w; - output0n += output_w; - } - // remain output height - for (; oh < output_h; ++oh) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 {d2-d3}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[0] \n" - "vdup.s8 d7, d0[1] \n" - "vdup.s8 d8, d0[2] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddl.s16 q12, d12, d14 \n" - "vaddl.s16 q13, d13, d15 \n" - - "vld1.8 {d2-d3}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[3] \n" - "vdup.s8 d7, d0[4] \n" - "vdup.s8 d8, d0[5] \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.8 {d2-d3}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d4, d2, d3, #1 \n" - "vext.8 d5, d2, d3, #2 \n" - "vdup.s8 d6, d0[6] \n" - "vdup.s8 d7, d0[7] \n" - "vdup.s8 d8, r5 \n" - "vmull.s8 q6, d2, d6 \n" - "vmull.s8 q7, d4, d7 \n" - "vmlal.s8 q6, d5, d8 \n" - "vaddw.s16 q12, q12, d12 \n" - "vaddw.s16 q12, q12, d14 \n" - "vaddw.s16 q13, q13, d13 \n" - "vaddw.s16 q13, q13, d15 \n" - - "vld1.32 {d12-d15}, [%[output0]] \n" - "vadd.s32 q6, q6, q12 \n" - "vadd.s32 q7, q7, q13 \n" - "vst1.32 {d12-d15}, [%[output0]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [ow] "+r"(ow), - [output0] "+r"(output0) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r5"); - } - - if (remain > 0) { - asm volatile( - "vld1.8 {d0}, [%[kernel0]] \n" - "ldr r5, [%[kernel0], #8] \n" - - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "vdup.s8 d2, r5 \n" - "vext.8 d8, d0, d2, #3 \n" - "vext.8 d9, d0, d2, #6 \n" - - "vmull.s8 q6, d4, d0 \n" - "vmull.s8 q7, d5, d8 \n" - "vmlal.s8 q6, d6, d9 \n" - "vaddl.s16 q12, d12, d14 \n" - "vdup.s32 d2, d24[1] \n" - "vadd.s32 d24, d24, d2 \n" - "vadd.s32 d24, d24, d25 \n" - - "ldr r7, [%[output0]] \n" - "vdup.s32 d14, r7 \n" - "vadd.s32 d14, d14, d24 \n" - "vst1.32 d14[0], [%[output0]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), - [remain] "+r"(remain), [output0] "+r"(output0) - : [kernel0] "r"(kernel0) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r5", "r7"); - } - } - } - } -#endif -#else -// TODO(hjchen2) -#endif -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/conv5x5_arm_int8.cpp b/src/operators/math/conv5x5_arm_int8.cpp deleted file mode 100644 index c861c22d184d5428f3ab9c8f3a69b9aca5b697bd..0000000000000000000000000000000000000000 --- a/src/operators/math/conv5x5_arm_int8.cpp +++ /dev/null @@ -1,551 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef CONV_OP - -#include "operators/math/conv_arm_int8.h" - -namespace paddle_mobile { -namespace operators { - -void conv5x5s1_int8(const framework::Tensor& input, - const framework::Tensor& weight, - framework::Tensor* output) { -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - const int8_t* in_data = input.data(); - const int8_t* w_data = weight.data(); - int32_t* out_data = output->mutable_data(); - // make sure that batch size is 1 - int input_c = input.dims()[1]; - int input_h = input.dims()[2]; - int input_w = input.dims()[3]; - int output_c = output->dims()[1]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int image_size = input_h * input_w; - int out_image_size = output_h * output_w; - memset(out_data, 0, output_c * out_image_size * sizeof(int32_t)); -#if __aarch64__ - // TODO(hjchen2) -#else - #pragma omp parallel for - for (int oc = 0; oc < output_c; ++oc) { - for (int ic = 0; ic < input_c; ++ic) { - const int8_t* kernel = w_data + (oc * input_c + ic) * 25; - int32_t* output0 = out_data + oc * out_image_size; - int32_t* output1 = output0 + output_w; - int oh = 0; - for (; oh < output_h - 1; oh += 2) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - const int8_t* r4 = r3 + input_w; - const int8_t* r5 = r4 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 {d4-d5}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d0[0] \n" - "vdup.s8 d11, d0[1] \n" - "vdup.s8 d12, d0[2] \n" - "vdup.s8 d13, d0[3] \n" - "vdup.s8 d14, d0[4] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q14, d16, d18 \n" - "vaddl.s16 q15, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q14, q14, d16 \n" - "vaddw.s16 q15, q15, d17 \n" - - "vld1.8 {d4-d5}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d6, d4, 
d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q10, d16, d18 \n" - "vaddl.s16 q11, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q10, q10, d16 \n" - "vaddw.s16 q11, q11, d17 \n" - - "vdup.s8 d10, d0[5] \n" - "vdup.s8 d11, d0[6] \n" - "vdup.s8 d12, d0[7] \n" - "vdup.s8 d13, d1[0] \n" - "vdup.s8 d14, d1[1] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vdup.s8 d10, d1[2] \n" - "vdup.s8 d11, d1[3] \n" - "vdup.s8 d12, d1[4] \n" - "vdup.s8 d13, d1[5] \n" - "vdup.s8 d14, d1[6] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, d2[0] \n" - "vdup.s8 d12, d2[1] \n" - "vdup.s8 d13, d2[2] \n" - "vdup.s8 d14, d2[3] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r4]] \n" // r4 - "add %[r4], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - - "vmull.s8 q8, d4, d10 \n" // next row - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vdup.s8 d10, d2[4] \n" - "vdup.s8 d11, d2[5] \n" - "vdup.s8 d12, d2[6] \n" - "vdup.s8 d13, d2[7] \n" - "vdup.s8 d14, d3[0] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, 
d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.32 {d24-d27}, [%[output0]] \n" - "vadd.s32 q12, q12, q14 \n" - "vadd.s32 q13, q13, q15 \n" - "vst1.32 {d24-d27}, [%[output0]]! \n" - - "vld1.8 {d4-d5}, [%[r5]] \n" // row 5 - "add %[r5], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q10, q10, q12 \n" - "vadd.s32 q11, q11, q13 \n" - - "vld1.32 {d24-d27}, [%[output1]] \n" - "vadd.s32 q12, q12, q10 \n" - "vadd.s32 q13, q13, q11 \n" - "vst1.32 {d24-d27}, [%[output1]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [r5] "+r"(r5), [ow] "+r"(ow), - [output0] "+r"(output0), [output1] "+r"(output1) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - } - if (remain > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "vld1.8 d8, [%[r4]] \n" - "vld1.8 d9, [%[r5]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "add %[r4], #1 \n" - "add %[r5], #1 \n" - "vext.8 d10, d0, d1, #5 \n" - "vext.8 d11, d1, d2, #2 \n" - "vext.8 d12, d1, d2, #7 \n" - "vext.8 d13, d2, d3, #4 \n" - - "vmull.s8 q7, d4, d0 \n" - "vmull.s8 q8, d5, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q10, d14, d16 \n" - "vaddw.s16 q10, q10, d18 \n" - "vadd.s32 d4, d20, d21 \n" - "vaddl.s16 q10, d15, d17 \n" - "vaddw.s16 q10, q10, d19 \n" - "vdup.s32 d14, d4[0] \n" - "vdup.s32 d15, d4[1] \n" - "vadd.s32 d15, d15, d14 \n" - "vdup.s32 d14, d20[0] \n" - "vadd.s32 d15, d15, d14 \n" - - "ldr r6, [%[output0]] \n" - "vdup.s32 d14, r6 \n" - "vadd.s32 d15, d15, d14 \n" - "vst1.32 d15[0], [%[output0]]! \n" - - "vmull.s8 q7, d5, d0 \n" - "vmull.s8 q8, d6, d10 \n" - "vmull.s8 q9, d7, d11 \n" - "vmlal.s8 q8, d8, d12 \n" - "vmlal.s8 q9, d9, d13 \n" - "vaddl.s16 q10, d14, d16 \n" - "vaddw.s16 q10, q10, d18 \n" - "vadd.s32 d4, d20, d21 \n" - "vaddl.s16 q10, d15, d17 \n" - "vaddw.s16 q10, q10, d19 \n" - "vdup.s32 d14, d4[0] \n" - "vdup.s32 d15, d4[1] \n" - "vadd.s32 d15, d15, d14 \n" - "vdup.s32 d14, d20[0] \n" - "vadd.s32 d15, d15, d14 \n" - - "ldr r6, [%[output1]] \n" - "vdup.s32 d14, r6 \n" - "vadd.s32 d15, d15, d14 \n" - "vst1.32 d15[0], [%[output1]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [r5] "+r"(r5), [remain] "+r"(remain), - [output0] "+r"(output0), [output1] "+r"(output1) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r6"); - } - output0 += output_w; - output1 += output_w; - } - // remain output height - for (; oh < output_h; ++oh) { - const int8_t* r0 = in_data + ic * image_size + oh * input_w; - const int8_t* r1 = r0 + input_w; - const int8_t* r2 = r1 + input_w; - const int8_t* r3 = r2 + input_w; - const int8_t* r4 = r3 + input_w; - - int ow = output_w >> 3; - int remain = output_w & 0x7; - if (ow > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 {d4-d5}, [%[r0]] \n" // r0 - "add %[r0], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d0[0] \n" - "vdup.s8 d11, d0[1] \n" - "vdup.s8 d12, d0[2] \n" - "vdup.s8 d13, d0[3] \n" - "vdup.s8 d14, d0[4] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q14, d16, d18 \n" - "vaddl.s16 q15, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q14, q14, d16 \n" - "vaddw.s16 q15, q15, d17 \n" - - "vld1.8 {d4-d5}, [%[r1]] \n" // r1 - "add %[r1], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d0[5] \n" - "vdup.s8 d11, d0[6] \n" - "vdup.s8 d12, d0[7] \n" - "vdup.s8 d13, d1[0] \n" - "vdup.s8 d14, d1[1] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r2]] \n" // r2 - "add %[r2], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d1[2] \n" - "vdup.s8 d11, d1[3] \n" - "vdup.s8 d12, d1[4] \n" - "vdup.s8 d13, d1[5] \n" - "vdup.s8 d14, d1[6] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r3]] \n" // r3 - "add %[r3], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d1[7] \n" - "vdup.s8 d11, d2[0] \n" - "vdup.s8 d12, d2[1] \n" - "vdup.s8 d13, d2[2] \n" - "vdup.s8 d14, d2[3] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.8 {d4-d5}, [%[r4]] \n" // r4 - "add %[r4], #8 \n" - "vext.8 d6, d4, d5, #1 \n" - "vext.8 d7, d4, d5, #2 \n" - "vext.8 d8, d4, d5, #3 \n" - "vext.8 d9, d4, d5, #4 \n" - "vdup.s8 d10, d2[4] \n" - "vdup.s8 d11, d2[5] \n" - "vdup.s8 d12, d2[6] \n" 
- "vdup.s8 d13, d2[7] \n" - "vdup.s8 d14, d3[0] \n" - "vmull.s8 q8, d4, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q12, d16, d18 \n" - "vaddl.s16 q13, d17, d19 \n" - "vmull.s8 q8, d9, d14 \n" - "vaddw.s16 q12, q12, d16 \n" - "vaddw.s16 q13, q13, d17 \n" - "vadd.s32 q14, q14, q12 \n" - "vadd.s32 q15, q15, q13 \n" - - "vld1.32 {d24-d27}, [%[output0]] \n" - "vadd.s32 q12, q12, q14 \n" - "vadd.s32 q13, q13, q15 \n" - "vst1.32 {d24-d27}, [%[output0]]! \n" - - "subs %[ow], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [ow] "+r"(ow), [output0] "+r"(output0) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); - } - - if (remain > 0) { - asm volatile("vld1.8 {d0-d3}, [%[kernel]] \n" - : [kernel] "+r"(kernel) - : - : "cc", "memory", "q0", "q1"); - asm volatile( - "0: \n" - "vld1.8 d4, [%[r0]] \n" - "vld1.8 d5, [%[r1]] \n" - "vld1.8 d6, [%[r2]] \n" - "vld1.8 d7, [%[r3]] \n" - "vld1.8 d8, [%[r4]] \n" - "add %[r0], #1 \n" - "add %[r1], #1 \n" - "add %[r2], #1 \n" - "add %[r3], #1 \n" - "add %[r4], #1 \n" - "vext.8 d10, d0, d1, #5 \n" - "vext.8 d11, d1, d2, #2 \n" - "vext.8 d12, d1, d2, #7 \n" - "vext.8 d13, d2, d3, #4 \n" - - "vmull.s8 q7, d4, d0 \n" - "vmull.s8 q8, d5, d10 \n" - "vmull.s8 q9, d6, d11 \n" - "vmlal.s8 q8, d7, d12 \n" - "vmlal.s8 q9, d8, d13 \n" - "vaddl.s16 q10, d14, d16 \n" - "vaddw.s16 q10, q10, d18 \n" - "vadd.s32 d4, d20, d21 \n" - "vaddl.s16 q10, d15, d17 \n" - "vaddw.s16 q10, q10, d19 \n" - "vdup.s32 d14, d4[0] \n" - "vdup.s32 d15, d4[1] \n" - "vadd.s32 d15, d15, d14 \n" - "vdup.s32 d14, d20[0] \n" - "vadd.s32 d15, d15, d14 \n" - - "ldr r6, [%[output0]] \n" - "vdup.s32 d14, r6 \n" - "vadd.s32 d15, d15, d14 \n" - "vst1.32 d15[0], [%[output0]]! 
\n" - - "subs %[remain], #1 \n" - "bne 0b \n" - : [r0] "+r"(r0), [r1] "+r"(r1), [r2] "+r"(r2), [r3] "+r"(r3), - [r4] "+r"(r4), [remain] "+r"(remain), [output0] "+r"(output0) - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "r6"); - } - } - } - } -#endif -#else -// TODO(hjchen2) -#endif -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index ea023bc134033aee6577ebf06c95f2a762d08bca..8498992fcecbcb2c9a773fba874e108c013a04fc 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -209,12 +209,18 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, int32_t lda, int8_t *buffer); void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer); + void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, + int32_t lda, int8_t *buffer); + void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, + int32_t ldb, int8_t *buffer); // 8 bits int matrix product void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C, int32_t ldc, bool relu, int8_t *bias); - + void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, + int32_t *C, int32_t ldc, bool relu, int8_t *bias); // 8 bits int write back // C = alpha * A * B + beta * C void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp index 5dd8a7c3131543f426f32e258efb3181be9b2f61..b16db7fe6acf0c3c7fb2902c9fb3f6e3dc81a65f 100644 --- a/src/operators/math/gemm_int8.cpp +++ b/src/operators/math/gemm_int8.cpp @@ -30,7 +30,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc) { #if __ARM_NEON #if __aarch64__ -// TODO +// TODO(wzzju) #else const int8_t *a_ptr, *b_ptr; a_ptr = a; @@ -246,7 +246,7 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc) { #if __ARM_NEON #if __aarch64__ -// TODO +// TODO(wzzju) #else const int8_t *a_ptr, *b_ptr; a_ptr = a; @@ -546,8 +546,12 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, #pragma omp parallel for for (int32_t j = 0; j < nc; j += NR) { for (int32_t i = 0; i < mc; i += MR_INT8) { +#if __aarch64__ + // TODO(wzzju) +#else // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); +#endif // __aarch64__ } } if (alpha != 1) { @@ -682,7 +686,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, const int8_t *b0 = &B(i, j); #if __ARM_NEON #if __aarch64__ - // TODO + // TODO(wzzju) #else asm volatile( // "pld [%[b0]] \n\t" @@ -791,7 +795,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc) { #if __ARM_NEON #if __aarch64__ -// TODO +// TODO(wzzju) #else int32_t nc1 = nc >> 4; int32_t _nc1 = nc & 15; diff --git a/src/operators/math/gemm_omp_int8.cpp b/src/operators/math/gemm_omp_int8.cpp new file mode 100644 index 0000000000000000000000000000000000000000..21256cccfcc6dcc647f34a2129616b70804d398f --- /dev/null +++ b/src/operators/math/gemm_omp_int8.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string.h>
+#include "common/log.h"
+#include "memory/t_malloc.h"
+#include "operators/math/gemm.h"
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+namespace math {
+
+// 8 bits int matrix product (m*k x k*n)
+void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
+                     const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
+                     int8_t beta, int32_t *C, int32_t ldc, bool relu,
+                     int8_t *bias) {
+#ifdef _OPENMP
+  int32_t max_threads = omp_get_max_threads();
+#else
+  int32_t max_threads = 1;
+#endif
+
+  int32_t L1 = 64 / max_threads * 1024;
+  KC = k;
+  zero_int8 =
+      static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
+  memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
+  if (m > n) {
+    // tile matrix A
+    MC = L1 / (KC * sizeof(int8_t));
+    if (MC == 0) {
+      MC = MR_INT8;
+    } else {
+      int32_t mblock_num = (m + MC - 1) / MC;
+      MC = (m + mblock_num - 1) / mblock_num;
+      MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+    }
+    // pad the columns of B up to a multiple of NR
+    NC = (n + NR - 1) / NR * NR;
+
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
+#endif
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
+  } else {
+    // tile matrix B
+    NC = L1 / (KC * sizeof(int8_t));
+    if (NC == 0) {
+      NC = NR;
+    } else {
+      int32_t nblock_num = (n + NC - 1) / NC;
+      NC = (n + nblock_num - 1) / nblock_num;
+      NC = (NC + NR - 1) / NR * NR;
+    }
+    // pad the rows of A up to a multiple of MR_INT8
+    MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
+
+    packedA_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
+#if __aarch64__
+    // TODO(wzzju)
+#else
+    PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
+#endif
+    packedB_int8 = static_cast<int8_t *>(
+        paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
+  }
+  packedC_int8 = static_cast<int32_t *>(
+      paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
+
+  if (m > n) {
+#pragma omp parallel for
+    for (int32_t i = 0; i < m; i += MC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+
+      int32_t mc;
+      mc = s_min(m - i, MC);
+      int8_t *local_A = packedA_int8 + MC * KC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
+#endif
+      InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
+                          &C(i, 0), ldc, relu, bias + i);
+    }
+  } else {
+#pragma omp parallel for
+    for (int32_t j = 0; j < n; j += NC) {
+#ifdef _OPENMP
+      int32_t local_threads = omp_get_thread_num();
+#else
+      int32_t local_threads = 0;
+#endif
+      int32_t nc;
+      nc = s_min(n - j, NC);
+      int8_t *local_B = packedB_int8 + KC * NC * local_threads;
+      int32_t *local_C = packedC_int8 + MC * NC * local_threads;
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
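+      // Per-thread scratch: the pack buffers were sized with a max_threads
+      // factor above, and local_B/local_C select a private slice via
+      // omp_get_thread_num(), so concurrent loop iterations never alias.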
+#endif
+      InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
+                          &C(0, j), ldc, relu, bias);
+    }
+  }
+
+  paddle_mobile::memory::Free(packedA_int8);
+  paddle_mobile::memory::Free(packedB_int8);
+  paddle_mobile::memory::Free(packedC_int8);
+  paddle_mobile::memory::Free(zero_int8);
+}
+
+void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
+                              const int8_t *B, int32_t ldb, int8_t *buffer) {
+  const int32_t j_length = n - n_tail;
+#pragma omp parallel for
+  for (int32_t j = 0; j < j_length; j += NR) {
+    int8_t *local_buffer = buffer + j * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j);
+#if __ARM_NEON
+#if __aarch64__
+      // TODO(wzzju)
+#else
+      asm volatile(
+          //  "pld        [%[b0]]                 \n\t"
+          "vld1.s8    {d0},   [%[b0]]             \n\t"
+          "vst1.s8    {d0},   [%[local_buffer]]!  \n\t"
+          : [local_buffer] "+r"(local_buffer)
+          : [b0] "r"(b0)
+          : "memory", "q0");
+#endif  // __aarch64__
+#else
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+      *local_buffer++ = *b0++;
+#endif  // __ARM_NEON
+    }
+  }
+  if (n_tail != 0) {
+    int8_t *local_buffer = buffer + j_length * k;
+    for (int32_t i = 0; i < k; ++i) {
+      const int8_t *b0 = &B(i, j_length);
+      for (int32_t j = j_length; j < n; ++j) {
+        *local_buffer++ = *b0++;
+      }
+      // zero-pad the ragged tail so every packed panel is NR columns wide
+      for (int32_t j = n; j < j_length + NR; ++j) {
+        *local_buffer++ = 0;
+      }
+    }
+  }
+}
+
+void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
+                              const int8_t *A, int32_t lda, int8_t *buffer) {
+  const int i_length = m - m_tail;
+#pragma omp parallel for
+  for (int32_t i = 0; i < i_length; i += MR_INT8) {
+    const int8_t *a0 = A + i * lda;
+    const int8_t *a1 = A + (i + 1) * lda;
+    const int8_t *a2 = A + (i + 2) * lda;
+    const int8_t *a3 = A + (i + 3) * lda;
+    int8_t *local_buffer = buffer + i * k;
+    for (int32_t j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+
+  if (m_tail != 0) {
+    const int8_t *a0 = &A(i_length, 0);
+    const int8_t *a1 = a0 + lda;
+    const int8_t *a2 = a0 + 2 * lda;
+    const int8_t *a3 = a0 + 3 * lda;
+    int8_t *local_buffer = buffer + i_length * k;
+    // intentional fall-through: rows past m read from the zero buffer
+    switch (m_tail) {
+      case 1:
+        a1 = zero_int8;
+      case 2:
+        a2 = zero_int8;
+      case 3:
+        a3 = zero_int8;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < k; ++j) {
+      *local_buffer++ = *a0++;
+      *local_buffer++ = *a1++;
+      *local_buffer++ = *a2++;
+      *local_buffer++ = *a3++;
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle_mobile
diff --git a/src/operators/math/math_function_int8.cpp b/src/operators/math/math_function_int8.cpp
index 70677223d12ded2da07ab53bc371f1e8da9fe293..e02824b290ebc0080613e2ae2365626d79576c9e 100644
--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -51,12 +51,23 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
     }
   }
 
+#ifdef _OPENMP
+  gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+                 matrix_out->data<int32_t>(), N, relu, bias);
+#else
   gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
              matrix_out->data<int32_t>(), N, relu, bias);
+#endif
   } else {
+#ifdef _OPENMP
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+                   matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
+#else
     gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
               matrix_b.data<int8_t>(), N, beta,
              matrix_out->data<int32_t>(), N, relu, bias);
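+    // Both branches compute the same int8 x int8 -> int32 product; the
+    // _OPENMP build simply routes through Sgemm_omp so the GEMM can use the
+    // OpenMP thread pool (e.g. SetThreadNum(8) in the tests below).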
+#endif
   }
 }
 }  // namespace math
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index 106f5c43c1762afa7f24a8c3e3e86beac8517834..2c0075271a92cb66ef95603965dd18d0dd3c5faf 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -243,6 +243,12 @@ class OpParam {
     return GetVarValue<T>("Y", outputs, scope);
   }
 
+  template <typename T>
+  static T *OutputXShapeFrom(const VariableNameMap &outputs,
+                             const Scope &scope) {
+    return GetVarValue<T>("XShape", outputs, scope);
+  }
+
   template <typename T>
   static T *OutputBoxesFrom(const VariableNameMap &outputs,
                             const Scope &scope) {
@@ -1126,6 +1132,37 @@ class TransposeParam : public OpParam {
 };
 #endif
 
+#ifdef TRANSPOSE2_OP
+template <typename Dtype>
+class Transpose2Param : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  Transpose2Param(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                  const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    output_xshape_ = OutputXShapeFrom<GType>(outputs, scope);
+    axis_ = GetAttr<vector<int>>("axis", attrs);
+  }
+
+  const RType *InputX() const { return input_x_; }
+
+  RType *Out() const { return out_; }
+
+  RType *OutputXShape() const { return output_xshape_; }
+
+  const vector<int> &Axis() const { return axis_; }
+
+ private:
+  RType *input_x_;
+  RType *out_;
+  RType *output_xshape_;
+  vector<int> axis_;
+};
+#endif
+
 #ifdef LOOKUP_OP
 template <typename Dtype>
 class LookupParam : public OpParam {
@@ -1233,6 +1270,49 @@ class ReshapeParam : public OpParam {
 };
 #endif
 
+#ifdef RESHAPE2_OP
+template <typename Dtype>
+class Reshape2Param : public OpParam {
+  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
+  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
+
+ public:
+  Reshape2Param(const VariableNameMap &inputs, const VariableNameMap &outputs,
+                const AttributeMap &attrs, const Scope &scope) {
+    input_x_ = InputXFrom<GType>(inputs, scope);
+    input_shape_ = InputShapeFrom<GType>(inputs, scope);
+    out_ = OutFrom<GType>(outputs, scope);
+    output_xshape_ = OutputXShapeFrom<GType>(outputs, scope);
+    shape_ = GetAttr<vector<int>>("shape", attrs);
+    if (HasAttr("inplace", attrs)) {
+      inplace_ = GetAttr<bool>("inplace", attrs);
+    } else {
+      inplace_ = false;
+    }
+  }
+
+  const RType *InputX() const { return input_x_; }
+
+  const RType *InputShape() const { return input_shape_; }
+
+  RType *Out() const { return out_; }
+
+  RType *OutputXShape() const { return output_xshape_; }
+
+  const vector<int> &Shape() const { return shape_; }
+
+  const bool &Inplace() const { return inplace_; }
+
+ private:
+  RType *input_x_;
+  RType *input_shape_;
+  RType *out_;
+  RType *output_xshape_;
+  vector<int> shape_;
+  bool inplace_;
+};
+#endif
+
 #ifdef SCALE_OP
 template <typename Dtype>
 class ScaleParam : public OpParam {
diff --git a/src/operators/reshape2_op.cpp b/src/operators/reshape2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1623076570d466fc53f885374060c5e744365ed
--- /dev/null
+++ b/src/operators/reshape2_op.cpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#include "operators/reshape2_op.h"
+#include <vector>
+#include "operators/kernel/reshape_kernel.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void Reshape2Op<Dtype, T>::InferShape() const {
+  auto &shape = this->param_.Shape();
+  auto input_x_dims = this->param_.InputX()->dims();
+  auto out_dims = ValidateShape(shape, input_x_dims);
+  this->param_.Out()->Resize(out_dims);
+  std::vector<int64_t> xshape_dims(input_x_dims.size() + 1, 0);
+  for (int i = 0; i < input_x_dims.size(); ++i) {
+    xshape_dims[i + 1] = input_x_dims[i];
+  }
+  this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims));
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op);
+#endif
+#ifdef PADDLE_MOBILE_MALI_GPU
+REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op);
+#endif
+
+#endif
diff --git a/src/operators/reshape2_op.h b/src/operators/reshape2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a06c2b9b90233b6ad0bacb6176f4cc274ff1cc0
--- /dev/null
+++ b/src/operators/reshape2_op.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef RESHAPE2_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/reshape2_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class Reshape2Op : public framework::OperatorWithKernel<
+                       DeviceType, Reshape2Param<DeviceType>,
+                       operators::Reshape2Kernel<DeviceType, T>> {
+ public:
+  Reshape2Op(const std::string &type, const VariableNameMap &inputs,
+             const VariableNameMap &outputs,
+             const framework::AttributeMap &attrs,
+             std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, Reshape2Param<DeviceType>,
+            operators::Reshape2Kernel<DeviceType, T>>(type, inputs, outputs,
+                                                      attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, Reshape2Param<DeviceType>,
+      operators::Reshape2Kernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+
+ protected:
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/transpose2_op.cpp b/src/operators/transpose2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..64d07991f60b4057e3d2841afa1bfe6483f31a88
--- /dev/null
+++ b/src/operators/transpose2_op.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TRANSPOSE2_OP
+
+#include <vector>
+
+#include "common/enforce.h"
+#include "operators/transpose2_op.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <typename Dtype, typename T>
+void Transpose2Op<Dtype, T>::InferShape() const {
+  auto input_x_dims = this->param_.InputX()->dims();
+  auto axis = this->param_.Axis();
+
+  size_t x_dims_size = input_x_dims.size();
+  size_t axis_size = axis.size();
+
+  PADDLE_MOBILE_ENFORCE((x_dims_size == axis_size),
+                        "The input's rank must be equal to the axis' size.")
+
+  std::vector<int> count(axis_size, 0);
+  for (size_t i = 0; i < axis_size; i++) {
+    PADDLE_MOBILE_ENFORCE(
+        axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
+        "Each element of Attribute axis should be a unique value "
+        "range from 0 to (dims - 1), "
+        "where the dims is the axis's size");
+  }
+  framework::DDim out_dims(input_x_dims);
+  for (size_t i = 0; i < axis_size; i++) {
+    out_dims[i] = input_x_dims[axis[i]];
+  }
+  this->param_.Out()->Resize(out_dims);
+  std::vector<int64_t> xshape_dims(input_x_dims.size() + 1, 0);
+  for (int i = 0; i < input_x_dims.size(); ++i) {
+    xshape_dims[i + 1] = input_x_dims[i];
+  }
+  this->param_.OutputXShape()->Resize(framework::make_ddim(xshape_dims));
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+namespace ops = paddle_mobile::operators;
+#ifdef PADDLE_MOBILE_CPU
+REGISTER_OPERATOR_CPU(transpose2, ops::Transpose2Op);
+#endif
+
+#endif  // TRANSPOSE2_OP
diff --git a/src/operators/transpose2_op.h b/src/operators/transpose2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f1339cc59e0c71a232eddd5dcef47f62994b80da
--- /dev/null
+++ b/src/operators/transpose2_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef TRANSPOSE2_OP
+
+#pragma once
+
+#include <string>
+
+#include "framework/operator.h"
+#include "operators/kernel/transpose2_kernel.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+using paddle_mobile::framework::Tensor;
+
+template <typename DeviceType, typename T>
+class Transpose2Op : public framework::OperatorWithKernel<
+                         DeviceType, Transpose2Param<DeviceType>,
+                         operators::Transpose2Kernel<DeviceType, T>> {
+ public:
+  Transpose2Op(const std::string &type, const VariableNameMap &inputs,
+               const VariableNameMap &outputs,
+               const framework::AttributeMap &attrs,
+               std::shared_ptr<framework::Scope> scope)
+      : framework::OperatorWithKernel<
+            DeviceType, Transpose2Param<DeviceType>,
+            operators::Transpose2Kernel<DeviceType, T>>(type, inputs, outputs,
+                                                        attrs, scope) {}
+
+  using framework::OperatorWithKernel<
+      DeviceType, Transpose2Param<DeviceType>,
+      operators::Transpose2Kernel<DeviceType, T>>::OperatorWithKernel;
+  void InferShape() const override;
+};
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a10088f9b417b628418404b8df3d340b851af383..2bd7169533f637add2a752feaceca8df132cb262 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -184,6 +184,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-transpose-op paddle-mobile)
 
+    # gen test
+    ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-transpose2-op paddle-mobile)
+
     # gen test
     ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-multiclassnms-op paddle-mobile)
@@ -200,6 +204,10 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-reshape-op paddle-mobile)
 
+    # gen test
+    ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-reshape2-op paddle-mobile)
+
     # gen test
     ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-relu-op paddle-mobile)
diff --git a/test/common/test_gemm_int8_accuracy.cpp b/test/common/test_gemm_int8_accuracy.cpp
index 80ddd40e121c81032c903955bd7116cf52695569..87f8d945648577ef1414417b57f4013d288dc043 100644
--- a/test/common/test_gemm_int8_accuracy.cpp
+++ b/test/common/test_gemm_int8_accuracy.cpp
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "common/log.h"
 #include "memory/t_malloc.h"
 #include "operators/math/gemm.h"
+#ifdef _OPENMP
+#include <omp.h>
+#endif  // _OPENMP
 
 #define a(i, j) a[(i)*lda + (j)]
 #define b(i, j) b[(i)*ldb + (j)]
@@ -84,8 +87,13 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
   }
 
   paddle_mobile::operators::math::Gemm gemm;
+#ifdef _OPENMP
+  gemm.Sgemm_omp(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
+                 static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#else
   gemm.Sgemm(m, n, k, static_cast<int8_t>(1), a, lda, b, ldb,
              static_cast<int8_t>(0), c, ldc, relu, nullptr);
+#endif
   int eq = 0;
   int neq = 0;
   for (int i = 0; i < m * n; ++i) {
@@ -119,12 +127,17 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
 }
 
 int main() {
-  do_sgemm(9, 9, 9, false, 10);
+#ifdef _OPENMP
+  omp_set_num_threads(8);
+#endif
+  do_sgemm(9, 9, 9, false, 1);
   do_sgemm(10, 6, 12, false, 0);
   do_sgemm(512, 256, 384, false, 0);
   do_sgemm(1366, 768, 256, false, 0);
   do_sgemm(1255, 755, 333, false, 0);
-  do_sgemm(555, 777, 999, false, 0);
+  do_sgemm(599, 1133, 393, false, 0);
+  do_sgemm(777, 555, 999, false, 0);
+  do_sgemm(333, 797, 939, false, 0);
 
   do_sgemm(1024, 1024, 1024, false, 0);
   return 0;
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
index 89f0012ae8effaab383719c1b85748c24eb2bf73..14da4ba284b5ac7b0660bd15de871fdf5ed04cdd 100644
--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(1);
+  paddle_mobile.SetThreadNum(8);
 
   Tensor aa, bb, cc;
   auto aaptr = aa.mutable_data<float>({m, k});
   auto bbptr = bb.mutable_data<float>({k, n});
diff --git a/test/net/test_googlenet.cpp b/test/net/test_googlenet.cpp
index c88a78974c330ec270fbcb3f5c28e368ef16440e..f7d29942224b51734cf62988ba8f271f1fa05bc3 100644
--- a/test/net/test_googlenet.cpp
+++ b/test/net/test_googlenet.cpp
@@ -25,8 +25,8 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
 #endif
 
-  paddle_mobile.SetThreadNum(1);
-  bool optimize = false;
+  paddle_mobile.SetThreadNum(4);
+  bool optimize = true;
   auto time1 = time();
   if (paddle_mobile.Load(g_googlenet, optimize)) {
     auto time2 = time();
@@ -35,10 +35,10 @@ int main() {
     std::vector<float> output;
     std::vector<int64_t> dims{1, 3, 224, 224};
     GetInput<float>(g_test_image_1x3x224x224, &input, dims);
-    //  //  warm up ten times
-    //  for (int i = 0; i < 10; ++i) {
-    //    output = paddle_mobile.Predict(input, dims);
-    //  }
+    //  warm up ten times
+    for (int i = 0; i < 10; ++i) {
+      output = paddle_mobile.Predict(input, dims);
+    }
     auto time3 = time();
     for (int i = 0; i < 10; ++i) {
       output = paddle_mobile.Predict(input, dims);
@@ -47,9 +47,6 @@ int main() {
     std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms"
               << std::endl;
 
-    for (int i = 0; i < output.size(); ++i) {
-      DLOG << "result[" << i << "] = " << output[i];
-    }
   }
   return 0;
 }
diff --git a/test/operators/test_batchnorm_op.cpp b/test/operators/test_batchnorm_op.cpp
index 4ccad8c1512036c2400a09575b3775e75b26acce..5f064d27f3f3f9cca5428467557c9412f76735c7 100644
--- a/test/operators/test_batchnorm_op.cpp
+++ b/test/operators/test_batchnorm_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/batchnorm_op.h"
diff --git a/test/operators/test_box_coder_op.cpp b/test/operators/test_box_coder_op.cpp
index 92cba3995c866c67c00491ad5cc38fb094594ad3..aeef10be9650623767af4d2de8913ce53b1d2c59 100644
--- a/test/operators/test_box_coder_op.cpp
+++ b/test/operators/test_box_coder_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/box_coder_op.h"
diff --git a/test/operators/test_elementwise_sub_op.cpp b/test/operators/test_elementwise_sub_op.cpp
index cfac83eff7a012d52d47f96e088bd8519603cadc..e27361b21c3146675ea856d02d70878e73e8912f 100644
--- a/test/operators/test_elementwise_sub_op.cpp
+++ b/test/operators/test_elementwise_sub_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/elementwise_sub_op.h"
diff --git a/test/operators/test_fill_constant_op.cpp b/test/operators/test_fill_constant_op.cpp
index b099217d1641eb221b3d0d86d780fb6ecfa929bd..99c65ed821c0a90691070b661a6967a11d4694f7 100644
--- a/test/operators/test_fill_constant_op.cpp
+++ b/test/operators/test_fill_constant_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/fill_constant_op.h"
diff --git a/test/operators/test_fusion_fc_op.cpp b/test/operators/test_fusion_fc_op.cpp
index a23bde45cb74f0f75e655821b15e66b1cef4c081..aaa2d7b578dbda4c6919210eb4a2fb42ba243e53 100644
--- a/test/operators/test_fusion_fc_op.cpp
+++ b/test/operators/test_fusion_fc_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include
 #include "../test_include.h"
 #include "operators/fusion_fc_op.h"
diff --git a/test/operators/test_im2sequence_op.cpp b/test/operators/test_im2sequence_op.cpp
index b45e437e12f95cd9f7050247fc03a152246d8122..6c69d1cc9d94ffd958251ee4ed783d6b5531c455 100644
--- a/test/operators/test_im2sequence_op.cpp
+++ b/test/operators/test_im2sequence_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/im2sequence_op.h"
diff --git a/test/operators/test_mul_op.cpp b/test/operators/test_mul_op.cpp
index 10dab2cda1b3c692f42cf8760eb2b48ae6451f39..262ee960e1c777d369d3b510eb31e5ed47b3493c 100644
--- a/test/operators/test_mul_op.cpp
+++ b/test/operators/test_mul_op.cpp
@@ -93,6 +93,8 @@ int TestMulOP() {
 }  // namespace paddle_mobile
 
 int main() {
+  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
+  paddle_mobile.SetThreadNum(8);
   paddle_mobile::TestMulOP<int8_t, int32_t>();
   paddle_mobile::TestMulOP<float, float>();
   return 0;
diff --git a/test/operators/test_multiclass_nms_op.cpp b/test/operators/test_multiclass_nms_op.cpp
index d1b98d4965fd182ab1adc480279f38cea53974be..3447bbdd10b64d2c2f497bdb4d5af15958a9a95b 100644
--- a/test/operators/test_multiclass_nms_op.cpp
+++ b/test/operators/test_multiclass_nms_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/multiclass_nms_op.h"
@@ -31,14 +30,12 @@ class TestMultiClassNMSOp {
     const std::vector<std::shared_ptr<BlockDesc>> blocks =
         to_predict_program_->Blocks();
-    //  DLOG << " **block size " << blocks.size();
     for (auto block_desc : blocks) {
       std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
-      //  DLOG << " ops " << ops.size();
       for (auto op : ops) {
         if (op->Type() == "multiclass_nms" &&
             op->Input("BBoxes")[0] == "box_coder_0.tmp_0") {
-          DLOG << " mul attr size: " << op->GetAttrMap().size();
+          DLOG << " attr size: " << op->GetAttrMap().size();
           DLOG << " inputs size: " << op->GetInputs().size();
           DLOG << " outputs size: " << op->GetOutputs().size();
           DLOG << " BBoxes is : " << op->Input("BBoxes")[0];
@@ -55,14 +52,6 @@ class TestMultiClassNMSOp {
                << op->GetAttrMap().at("nms_top_k").Get<int>();
           DLOG << " score_threshold : "
                << op->GetAttrMap().at("score_threshold").Get<float>();
-          //          DLOG << " variances : " <<
-          //          op->GetAttrMap().at("variances").Get<std::vector<float>>();
-          //          DLOG << " aspect_ratios : " <<
-          //          op->GetAttrMap().at("aspect_ratios").Get<std::vector<float>>();
-          //          DLOG << " min_sizes : " <<
-          //          op->GetAttrMap().at("min_sizes").Get<std::vector<float>>();
-          //          DLOG << " max_sizes : " <<
-          //          op->GetAttrMap().at("max_sizes").Get<std::vector<float>>();
           std::shared_ptr<operators::MultiClassNMSOp<Dtype, float>> priorbox =
               std::make_shared<operators::MultiClassNMSOp<Dtype, float>>(
                   op->Type(), op->GetInputs(), op->GetOutputs(),
@@ -88,16 +77,12 @@ class TestMultiClassNMSOp {
     auto *output_tensor = output->GetMutable<LoDTensor>();
     output_tensor->mutable_data<float>({1917, 6});
 
-    //  DLOG << typeid(output_tensor).name();
-    //  DLOG << "output_tensor dims: " << output_tensor->dims();
-
     std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
     out_tensor.reset(output_tensor);
 
     predict(t1, t2, 0);
 
     return out_tensor;
-    //  return outvars_tensor;
   }
 
  private:
diff --git a/test/operators/test_polygon_box_transform_op.cpp b/test/operators/test_polygon_box_transform_op.cpp
index a71177ddbd8e4d8b0f204fd6ec9c948882499cbd..5b30ce1ebfd59db972953e16e4506fa2595b8f04 100644
--- a/test/operators/test_polygon_box_transform_op.cpp
+++ b/test/operators/test_polygon_box_transform_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/polygon_box_transform_op.h"
diff --git a/test/operators/test_prior_box_op.cpp b/test/operators/test_prior_box_op.cpp
index 8c697a9a7982f05b71caa5bb5f4d12e50dc9d418..2c75d01df297030b4633829ac4b29f7592aaf5c4 100644
--- a/test/operators/test_prior_box_op.cpp
+++ b/test/operators/test_prior_box_op.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
 
 #include "../test_include.h"
 #include "operators/prior_box_op.h"
diff --git a/test/operators/test_reshape2_op.cpp b/test/operators/test_reshape2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42c348a6274592eb23332620131faa0784a71d28
--- /dev/null
+++ b/test/operators/test_reshape2_op.cpp
@@ -0,0 +1,142 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/reshape2_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+template <typename Dtype>
+class TestReshape2Op {
+ public:
+  explicit TestReshape2Op(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    for (auto block_desc : blocks) {
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      for (auto op : ops) {
+        if (op->Type() == "reshape2") {
+          DLOG << " attr size: " << op->GetAttrMap().size();
+          std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
+          for (std::unordered_map<std::string, Attribute>::iterator it =
+                   attrs.begin();
+               it != attrs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " inputs size: " << op->GetInputs().size();
+          VariableNameMap inputs = op->GetInputs();
+          for (VariableNameMap::iterator it = inputs.begin();
+               it != inputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " outputs size: " << op->GetOutputs().size();
+          VariableNameMap outputs = op->GetOutputs();
+          for (VariableNameMap::iterator it = outputs.begin();
+               it != outputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          input_var_name = op->Input("X")[0];
+          output_var_name = op->Output("Out")[0];
+          std::shared_ptr<operators::Reshape2Op<Dtype, float>> op_ptr =
+              std::make_shared<operators::Reshape2Op<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          return;
+        }
+      }
+    }
+  }
+
+  std::shared_ptr<Tensor> predict(const Tensor &t) {
+    auto scope = program_.scope;
+    Variable *input_feed_value = scope->Var(input_var_name);
+    auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
+    tensor_input->ShareDataWith(t);
+
+    Variable *output = scope->Var(output_var_name);
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    predict(t, 0);
+
+    return out_tensor;
+  }
+
+ private:
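+  // Harness state, mirroring the other single-op tests in this directory:
+  // program_ holds the loaded model, ops_of_block_ caches the single
+  // reshape2 op located above, and predict(t, 0) runs it against block 0.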
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+  string input_var_name;
+  string output_var_name;
+
+  void predict(const Tensor &t, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      op->Run();
+    }
+  }
+};
+
+template class TestReshape2Op<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Reshape2 Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params");
+
+  paddle_mobile::framework::Tensor input;
+  SetupTensor<float>(&input, {1, 4, 4}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *input_ptr = input.data<float>();
+  for (int i = 0; i < 16; ++i) {
+    *(input_ptr + i) = i;
+  }
+  DLOG << "input : ";
+  for (int i = 0; i < input.numel(); ++i) {
+    DLOG << " index " << i << " : " << input_ptr[i];
+  }
+
+  paddle_mobile::framework::TestReshape2Op<paddle_mobile::CPU> testReshape2Op(
+      program);
+
+  auto output = testReshape2Op.predict(input);
+  auto *output_ptr = output->data<float>();
+
+  DLOG << "output : ";
+  for (int i = 0; i < output->numel(); ++i) {
+    DLOG << " index " << i << " : " << output_ptr[i];
+  }
+  return 0;
+}
diff --git a/test/operators/test_sum_op.cpp b/test/operators/test_sum_op.cpp
index e51d1cff5e99c5d9c444db046e78eee6a03f9243..467529d8d3877fcb9ac5527daf5f037aea6d18fc 100644
--- a/test/operators/test_sum_op.cpp
+++ b/test/operators/test_sum_op.cpp
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#pragma once
-
 #include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/sum_op.h"
diff --git a/test/operators/test_transpose2_op.cpp b/test/operators/test_transpose2_op.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b75a957cd5c1cd08dc09895e9e2448761e822274
--- /dev/null
+++ b/test/operators/test_transpose2_op.cpp
@@ -0,0 +1,143 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+#include "operators/transpose2_op.h"
+
+namespace paddle_mobile {
+namespace framework {
+
+template <typename Dtype>
+class TestTranspose2Op {
+ public:
+  explicit TestTranspose2Op(const Program<Dtype> p) : program_(p) {
+    if (use_optimize_) {
+      to_predict_program_ = program_.optimizeProgram;
+    } else {
+      to_predict_program_ = program_.originProgram;
+    }
+    const std::vector<std::shared_ptr<BlockDesc>> blocks =
+        to_predict_program_->Blocks();
+    for (auto block_desc : blocks) {
+      std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
+      for (auto op : ops) {
+        if (op->Type() == "transpose2") {
+          DLOG << " attr size: " << op->GetAttrMap().size();
+          std::unordered_map<std::string, Attribute> attrs = op->GetAttrMap();
+          for (std::unordered_map<std::string, Attribute>::iterator it =
+                   attrs.begin();
+               it != attrs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " inputs size: " << op->GetInputs().size();
+          VariableNameMap inputs = op->GetInputs();
+          for (VariableNameMap::iterator it = inputs.begin();
+               it != inputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          DLOG << " outputs size: " << op->GetOutputs().size();
+          VariableNameMap outputs = op->GetOutputs();
+          for (VariableNameMap::iterator it = outputs.begin();
+               it != outputs.end(); ++it) {
+            DLOG << "  " << it->first << " " << it->second;
+          }
+
+          input_var_name = op->Input("X")[0];
+          output_var_name = op->Output("Out")[0];
+          std::shared_ptr<operators::Transpose2Op<Dtype, float>> op_ptr =
+              std::make_shared<operators::Transpose2Op<Dtype, float>>(
+                  op->Type(), op->GetInputs(), op->GetOutputs(),
+                  op->GetAttrMap(), program_.scope);
+          ops_of_block_[*block_desc.get()].push_back(op_ptr);
+          return;
+        }
+      }
+    }
+  }
+
+  std::shared_ptr<Tensor> predict(const Tensor &t) {
+    auto scope = program_.scope;
+    Variable *input_feed_value = scope->Var(input_var_name);
+    auto tensor_input = input_feed_value->GetMutable<LoDTensor>();
+    tensor_input->ShareDataWith(t);
+
+    Variable *output = scope->Var(output_var_name);
+    auto *output_tensor = output->GetMutable<LoDTensor>();
+    output_tensor->mutable_data<float>({1, 2, 8});
+
+    std::shared_ptr<Tensor> out_tensor = std::make_shared<LoDTensor>();
+    out_tensor.reset(output_tensor);
+
+    predict(t, 0);
+
+    return out_tensor;
+  }
+
+ private:
+  const framework::Program<Dtype> program_;
+  std::shared_ptr<ProgramDesc> to_predict_program_;
+  std::map<framework::BlockDesc,
+           std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
+      ops_of_block_;
+  bool use_optimize_ = false;
+  string input_var_name;
+  string output_var_name;
+
+  void predict(const Tensor &t, int block_id) {
+    std::shared_ptr<BlockDesc> to_predict_block =
+        to_predict_program_->Block(block_id);
+    for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
+      auto op = ops_of_block_[*to_predict_block.get()][j];
+      op->Run();
+    }
+  }
+};
+
+template class TestTranspose2Op<CPU>;
+}  // namespace framework
+}  // namespace paddle_mobile
+
+int main() {
+  DLOG << "----------**********----------";
+  DLOG << "begin to run Transpose2 Test";
+  paddle_mobile::Loader<paddle_mobile::CPU> loader;
+  auto program = loader.Load(std::string(g_ocr) + "/model",
+                             std::string(g_ocr) + "/params");
+
+  paddle_mobile::framework::Tensor input;
+  SetupTensor<float>(&input, {1, 8, 2}, static_cast<float>(0),
+                     static_cast<float>(1));
+  auto *input_ptr = input.data<float>();
+  for (int i = 0; i < 16; ++i) {
+    *(input_ptr + i) = i;
+  }
+  DLOG << "input : ";
+  for (int i = 0; i < input.numel(); ++i) {
+    DLOG << " index " << i << " : " << input_ptr[i];
+  }
+
+  paddle_mobile::framework::TestTranspose2Op<paddle_mobile::CPU>
+      testTranspose2Op(program);
+
+  auto output = testTranspose2Op.predict(input);
+  auto *output_ptr = output->data<float>();
+
+  DLOG << "output : ";
+  for (int i = 0; i < output->numel(); ++i) {
+    DLOG << " index " << i << " : " << output_ptr[i];
+  }
+  return 0;
+}
diff --git a/tools/build.sh b/tools/build.sh
index 1408822e46850752bcd448350fc483c25f70ae9a..c6554105718304c195bb4a3326c80947719033a0 100755
--- a/tools/build.sh
+++ b/tools/build.sh
@@ -160,7 +160,7 @@ build_for_ios() {
     fi
     cd "${BUILD_DIR}"
     make -j 8
-    cp ../../../src/ios_io/PaddleMobile.h ./build/PaddleMobile.h
+    cp ../../../src/ios_io/PaddleMobileCPU.h ./build/PaddleMobileCPU.h
     cd ./build
     # generate the symbol table
     ranlib *.a
diff --git a/tools/op.cmake b/tools/op.cmake
index f7a6ed4b134f78ddb23487cd3a861f244e6a86db..2e1e311a2c96bac02257cfdce2d2fbebcd962dfb 100644
--- a/tools/op.cmake
+++ b/tools/op.cmake
@@ -201,9 +201,11 @@ if(NOT FOUND_MATCH)
     set(PRIORBOX_OP ON)
     set(RELU_OP ON)
     set(RESHAPE_OP ON)
+    set(RESHAPE2_OP ON)
     set(SIGMOID_OP ON)
     set(SOFTMAX_OP ON)
     set(TRANSPOSE_OP ON)
+    set(TRANSPOSE2_OP ON)
     set(FUSION_CONVADDBNRELU_OP ON)
     set(FUSION_CONVADDADDPRELU_OP ON)
     set(FUSION_DWCONVBNRELU_OP ON)
@@ -246,9 +248,11 @@ endif()
    # option(PRIORBOX_OP "" ON)
    # option(RELU_OP "" ON)
    # option(RESHAPE_OP "" ON)
+   # option(RESHAPE2_OP "" ON)
    # option(SIGMOID_OP "" ON)
    # option(SOFTMAX_OP "" ON)
    # option(TRANSPOSE_OP "" ON)
+   # option(TRANSPOSE2_OP "" ON)
 # endif ()
 
 if (BATCHNORM_OP)
@@ -314,6 +318,9 @@ endif()
 if (RESHAPE_OP)
     add_definitions(-DRESHAPE_OP)
 endif()
+if (RESHAPE2_OP)
+    add_definitions(-DRESHAPE2_OP)
+endif()
 if (SIGMOID_OP)
     add_definitions(-DSIGMOID_OP)
 endif()
@@ -323,6 +330,9 @@ endif()
 if (TRANSPOSE_OP)
     add_definitions(-DTRANSPOSE_OP)
 endif()
+if (TRANSPOSE2_OP)
+    add_definitions(-DTRANSPOSE2_OP)
+endif()
 if (FUSION_CONVADDBNRELU_OP)
     add_definitions(-DFUSION_CONVADDBNRELU_OP)
 endif()
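
Note on the reshape2/transpose2 additions above: both ops differ from their
reshape/transpose counterparts mainly in the extra "XShape" output, whose dims
are the input dims with a leading 0 prepended (see the xshape_dims loops in
both InferShape() implementations). A minimal stand-alone C++ sketch of that
convention follows; MakeXShapeDims is an illustrative name, not code from the
patch:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Build XShape dims the way reshape2/transpose2 InferShape() does:
    // prepend a 0, then copy the input dims after it.
    std::vector<int64_t> MakeXShapeDims(const std::vector<int64_t> &in_dims) {
      std::vector<int64_t> xshape(in_dims.size() + 1, 0);
      for (size_t i = 0; i < in_dims.size(); ++i) {
        xshape[i + 1] = in_dims[i];
      }
      return xshape;
    }

    int main() {
      // For the {1, 8, 2} input used by test_transpose2_op.cpp,
      // XShape is resized to {0, 1, 8, 2}.
      for (int64_t d : MakeXShapeDims({1, 8, 2})) {
        std::cout << d << " ";
      }
      std::cout << std::endl;  // prints: 0 1 8 2
      return 0;
    }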