prune

3a631fbb · Chunwei · 0f9e7057 · 0f9e7057 · 0f9e7057 · 0f9e7057
16 changed file
--- a/paddle/fluid/lite/arm/math/activation.cc
+++ b/paddle/fluid/lite/arm/math/activation.cc
--- a/paddle/fluid/lite/arm/math/activation.h
+++ b/paddle/fluid/lite/arm/math/activation.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <typename T>
-void act_relu(const T* din, T* dout, int size, int threads);
-template <typename T>
-void act_relu_neg(const T* din, T* dout, int size, const float negative_slope,
-                  int threads);
-template <typename T>
-void act_clipped_relu(const T* din, T* dout, int size, const float coef,
-                      int threads);
-template <typename T>
-void act_prelu(const T* din, T* dout, int outer_size, int channel_size,
-               int inner_size, bool channel_shared, float* channel_slope,
-               int threads);
-template <typename T>
-void act_sigmoid(const T* din, T* dout, int size, int threads);
-template <typename T>
-void act_tanh(const T* din, T* dout, int size, int threads);
-template <typename T>
-void act_swish(const T* din, T* dout, int size, const float coef, int threads);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/concat.cc
+++ b/paddle/fluid/lite/arm/math/concat.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/lite/arm/math/concat.h"
-#include <algorithm>
-#include <limits>
-#include <memory>
-#include "paddle/fluid/lite/arm/math/funcs.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-void concat_func(const std::vector<lite::Tensor *> &input, const int axis,
-                 lite::Tensor *output) {
-  size_t num = input.size();
-  int rows = 1;
-  auto dim_0 = input[0]->dims();
-  for (int i = 0; i < axis; ++i) {
-    rows *= dim_0[i];
-  }
-  int out_rows = rows, out_cols = 0;
-  std::vector<int64_t> input_cols(input.size());
-  for (int i = 0; i < num; ++i) {
-    int t_cols = input[i]->numel() / rows;
-    out_cols += t_cols;
-    input_cols[i] = t_cols;
-  }
-  // computation
-  for (int k = 0; k < out_rows; ++k) {
-    float *dst_ptr = output->mutable_data<float>() + k * out_cols;
-    int col_idx = 0;
-    for (int j = 0; j < num; ++j) {
-      int col_len = input_cols[j];
-      const float *src_prt = input[j]->data<float>() + k * col_len;
-      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
-      col_idx += col_len;
-    }
-  }
-}
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/concat.h
+++ b/paddle/fluid/lite/arm/math/concat.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/lite/operators/op_params.h"
-#include "paddle/fluid/lite/utils/cp_logging.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-void concat_func(const std::vector<lite::Tensor *> &input, const int axis,
-                 lite::Tensor *output);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/dropout.cc
+++ b/paddle/fluid/lite/arm/math/dropout.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/lite/arm/math/dropout.h"
-#include "paddle/fluid/lite/arm/math/funcs.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <>
-void dropout_down<float>(const float* din, float* dout, int num, float prob) {
-  const float scale = 1.0f - prob;
-  int cnt = num >> 4;
-  int remain = num % 16;
-  float32x4_t vscale = vdupq_n_f32(scale);
-#pragma omp parallel for
-  for (int i = 0; i < cnt; i++) {
-    const float* din_ptr = din + (i << 4);
-    float* dout_ptr = dout + (i << 4);
-    float32x4_t din0 = vld1q_f32(din_ptr);
-    float32x4_t din1 = vld1q_f32(din_ptr + 4);
-    float32x4_t din2 = vld1q_f32(din_ptr + 8);
-    float32x4_t din3 = vld1q_f32(din_ptr + 12);
-    float32x4_t vmul0 = vmulq_f32(din0, vscale);
-    float32x4_t vmul1 = vmulq_f32(din1, vscale);
-    float32x4_t vmul2 = vmulq_f32(din2, vscale);
-    float32x4_t vmul3 = vmulq_f32(din3, vscale);
-    vst1q_f32(dout_ptr, vmul0);
-    vst1q_f32(dout_ptr + 4, vmul1);
-    vst1q_f32(dout_ptr + 8, vmul2);
-    vst1q_f32(dout_ptr + 12, vmul3);
-  }
-  if (remain > 0) {
-    const float* din_ptr = din + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
-    for (int i = 0; i < remain; i++) {
-      *dout_ptr = *din_ptr * scale;
-      dout_ptr++;
-      din_ptr++;
-    }
-  }
-}
-template <>
-void dropout_up<float>(const float* din, float* dout, int num) {
-  int cnt = num >> 4;
-  int remain = num % 16;
-#pragma omp parallel for
-  for (int i = 0; i < cnt; i++) {
-    const float* din_ptr = din + (i << 4);
-    float* dout_ptr = dout + (i << 4);
-    float32x4_t din0 = vld1q_f32(din_ptr);
-    float32x4_t din1 = vld1q_f32(din_ptr + 4);
-    float32x4_t din2 = vld1q_f32(din_ptr + 8);
-    float32x4_t din3 = vld1q_f32(din_ptr + 12);
-    vst1q_f32(dout_ptr, din0);
-    vst1q_f32(dout_ptr + 4, din1);
-    vst1q_f32(dout_ptr + 8, din2);
-    vst1q_f32(dout_ptr + 12, din3);
-  }
-  if (remain > 0) {
-    const float* din_ptr = din + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
-    for (int i = 0; i < remain; i++) {
-      *dout_ptr = *din_ptr;
-      dout_ptr++;
-      din_ptr++;
-    }
-  }
-}
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/dropout.h
+++ b/paddle/fluid/lite/arm/math/dropout.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <string>
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <typename T>
-void dropout_down(const T* din, T* dout, int num, float prob);
-template <typename T>
-void dropout_up(const T* din, T* dout, int num);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/elementwise.cc
+++ b/paddle/fluid/lite/arm/math/elementwise.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/lite/arm/math/elementwise.h"
-#include "paddle/fluid/lite/arm/math/funcs.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <>
-void elementwise_add<float>(const float* dinx, const float* diny, float* dout,
-                            int num) {
-  int cnt = num >> 4;
-  int remain = num % 16;
-#pragma omp parallel for
-  for (int i = 0; i < cnt; i++) {
-    const float* dinx_ptr = dinx + (i << 4);
-    const float* diny_ptr = diny + (i << 4);
-    float* dout_ptr = dout + (i << 4);
-    float32x4_t dinx0 = vld1q_f32(dinx_ptr);
-    float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4);
-    float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8);
-    float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12);
-    float32x4_t diny0 = vld1q_f32(diny_ptr);
-    float32x4_t diny1 = vld1q_f32(diny_ptr + 4);
-    float32x4_t diny2 = vld1q_f32(diny_ptr + 8);
-    float32x4_t diny3 = vld1q_f32(diny_ptr + 12);
-    dinx0 = vaddq_f32(dinx0, diny0);
-    dinx1 = vaddq_f32(dinx1, diny1);
-    dinx2 = vaddq_f32(dinx2, diny2);
-    dinx3 = vaddq_f32(dinx3, diny3);
-    vst1q_f32(dout_ptr, dinx0);
-    vst1q_f32(dout_ptr + 4, dinx1);
-    vst1q_f32(dout_ptr + 8, dinx2);
-    vst1q_f32(dout_ptr + 12, dinx3);
-  }
-  if (remain > 0) {
-    const float* dinx_ptr = dinx + (cnt << 4);
-    const float* diny_ptr = diny + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
-    for (int i = 0; i < remain; i++) {
-      *dout_ptr = *dinx_ptr + *diny_ptr;
-      dout_ptr++;
-      dinx_ptr++;
-      diny_ptr++;
-    }
-  }
-}
-template <>
-void elementwise_add_relu<float>(const float* dinx, const float* diny,
-                                 float* dout, int num) {
-  int cnt = num >> 4;
-  int remain = num % 16;
-  float32x4_t vzero = vdupq_n_f32(0.f);
-#pragma omp parallel for
-  for (int i = 0; i < cnt; i++) {
-    const float* dinx_ptr = dinx + (i << 4);
-    const float* diny_ptr = diny + (i << 4);
-    float* dout_ptr = dout + (i << 4);
-    float32x4_t dinx0 = vld1q_f32(dinx_ptr);
-    float32x4_t dinx1 = vld1q_f32(dinx_ptr + 4);
-    float32x4_t dinx2 = vld1q_f32(dinx_ptr + 8);
-    float32x4_t dinx3 = vld1q_f32(dinx_ptr + 12);
-    float32x4_t diny0 = vld1q_f32(diny_ptr);
-    float32x4_t diny1 = vld1q_f32(diny_ptr + 4);
-    float32x4_t diny2 = vld1q_f32(diny_ptr + 8);
-    float32x4_t diny3 = vld1q_f32(diny_ptr + 12);
-    dinx0 = vaddq_f32(dinx0, diny0);
-    dinx1 = vaddq_f32(dinx1, diny1);
-    dinx2 = vaddq_f32(dinx2, diny2);
-    dinx3 = vaddq_f32(dinx3, diny3);
-    // relu
-    dinx0 = vmaxq_f32(dinx0, vzero);
-    dinx1 = vmaxq_f32(dinx1, vzero);
-    dinx2 = vmaxq_f32(dinx2, vzero);
-    dinx3 = vmaxq_f32(dinx3, vzero);
-    vst1q_f32(dout_ptr, dinx0);
-    vst1q_f32(dout_ptr + 4, dinx1);
-    vst1q_f32(dout_ptr + 8, dinx2);
-    vst1q_f32(dout_ptr + 12, dinx3);
-  }
-  if (remain > 0) {
-    const float* dinx_ptr = dinx + (cnt << 4);
-    const float* diny_ptr = diny + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
-    for (int i = 0; i < remain; i++) {
-      float tmp = *dinx_ptr + *diny_ptr;
-      *dout_ptr = tmp > 0.f ? tmp : 0.f;
-      dout_ptr++;
-      dinx_ptr++;
-      diny_ptr++;
-    }
-  }
-}
-template <>
-void elementwise_add_broadcast<float>(const float* dinx, const float* diny,
-                                      float* dout, int batch, int channels,
-                                      int num) {
-#pragma omp parallel for collapse(2)
-  for (int i = 0; i < batch; ++i) {
-    for (int j = 0; j < channels; ++j) {
-      int offset = (i * channels + j) * num;
-      const float* din_ptr = dinx + offset;
-      const float diny_data = diny[j];
-      float* dout_ptr = dout + offset;
-      int cnt = num >> 4;
-      int remain = num % 16;
-      float32x4_t rb = vdupq_n_f32(diny_data);
-      for (int k = 0; k < cnt; ++k) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        float32x4_t din1 = vld1q_f32(din_ptr + 4);
-        float32x4_t din2 = vld1q_f32(din_ptr + 8);
-        float32x4_t din3 = vld1q_f32(din_ptr + 12);
-        din0 = vaddq_f32(din0, rb);
-        din1 = vaddq_f32(din1, rb);
-        din2 = vaddq_f32(din2, rb);
-        din3 = vaddq_f32(din3, rb);
-        vst1q_f32(dout_ptr, din0);
-        vst1q_f32(dout_ptr + 4, din1);
-        vst1q_f32(dout_ptr + 8, din2);
-        vst1q_f32(dout_ptr + 12, din3);
-        din_ptr += 16;
-        dout_ptr += 16;
-      }
-      if (remain >= 8) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        float32x4_t din1 = vld1q_f32(din_ptr + 4);
-        din0 = vaddq_f32(din0, rb);
-        din1 = vaddq_f32(din1, rb);
-        vst1q_f32(dout_ptr, din0);
-        vst1q_f32(dout_ptr + 4, din1);
-        din_ptr += 8;
-        dout_ptr += 8;
-        remain -= 8;
-      }
-      if (remain >= 4) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        din0 = vaddq_f32(din0, rb);
-        vst1q_f32(dout_ptr, din0);
-        din_ptr += 4;
-        dout_ptr += 4;
-        remain -= 4;
-      }
-      if (remain > 0) {
-        for (int p = 0; p < remain; p++) {
-          *dout_ptr = *din_ptr + diny_data;
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
-    }
-  }
-}
-template <>
-void elementwise_add_relu_broadcast<float>(const float* dinx, const float* diny,
-                                           float* dout, int batch, int channels,
-                                           int num) {
-  float32x4_t vzero = vdupq_n_f32(0.f);
-#pragma omp parallel for collapse(2)
-  for (int i = 0; i < batch; ++i) {
-    for (int j = 0; j < channels; ++j) {
-      int offset = (i * channels + j) * num;
-      const float* din_ptr = dinx + offset;
-      const float diny_data = diny[j];
-      float* dout_ptr = dout + offset;
-      int cnt = num >> 4;
-      int remain = num % 16;
-      float32x4_t rb = vdupq_n_f32(diny_data);
-      for (int k = 0; k < cnt; ++k) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        float32x4_t din1 = vld1q_f32(din_ptr + 4);
-        float32x4_t din2 = vld1q_f32(din_ptr + 8);
-        float32x4_t din3 = vld1q_f32(din_ptr + 12);
-        din0 = vaddq_f32(din0, rb);
-        din1 = vaddq_f32(din1, rb);
-        din2 = vaddq_f32(din2, rb);
-        din3 = vaddq_f32(din3, rb);
-        // relu
-        din0 = vmaxq_f32(din0, vzero);
-        din1 = vmaxq_f32(din1, vzero);
-        din2 = vmaxq_f32(din2, vzero);
-        din3 = vmaxq_f32(din3, vzero);
-        vst1q_f32(dout_ptr, din0);
-        vst1q_f32(dout_ptr + 4, din1);
-        vst1q_f32(dout_ptr + 8, din2);
-        vst1q_f32(dout_ptr + 12, din3);
-        din_ptr += 16;
-        dout_ptr += 16;
-      }
-      if (remain >= 8) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        float32x4_t din1 = vld1q_f32(din_ptr + 4);
-        din0 = vaddq_f32(din0, rb);
-        din1 = vaddq_f32(din1, rb);
-        // relu
-        din0 = vmaxq_f32(din0, vzero);
-        din1 = vmaxq_f32(din1, vzero);
-        vst1q_f32(dout_ptr, din0);
-        vst1q_f32(dout_ptr + 4, din1);
-        din_ptr += 8;
-        dout_ptr += 8;
-        remain -= 8;
-      }
-      if (remain >= 4) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        din0 = vaddq_f32(din0, rb);
-        // relu
-        din0 = vmaxq_f32(din0, vzero);
-        vst1q_f32(dout_ptr, din0);
-        din_ptr += 4;
-        dout_ptr += 4;
-        remain -= 4;
-      }
-      if (remain > 0) {
-        for (int p = 0; p < remain; p++) {
-          float tmp = *din_ptr + diny_data;
-          *dout_ptr = tmp > 0.f ? tmp : 0.f;
-          dout_ptr++;
-          din_ptr++;
-        }
-      }
-    }
-  }
-}
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/elementwise.h
+++ b/paddle/fluid/lite/arm/math/elementwise.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <typename T>
-void elementwise_add(const T* dinx, const T* diny, T* dout, int num);
-template <typename T>
-void elementwise_add_relu(const T* dinx, const T* diny, T* dout, int num);
-template <typename T>
-void elementwise_add_broadcast(const T* dinx, const T* diny, T* dout, int batch,
-                               int channels, int num);
-template <typename T>
-void elementwise_add_relu_broadcast(const T* dinx, const T* diny, T* dout,
-                                    int batch, int channels, int num);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/pooling.cc
+++ b/paddle/fluid/lite/arm/math/pooling.cc
--- a/paddle/fluid/lite/arm/math/pooling.h
+++ b/paddle/fluid/lite/arm/math/pooling.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <algorithm>
-#include <string>
-#include <vector>
-#include "paddle/fluid/lite/utils/cp_logging.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-// !pooling fp32 Op
-void pooling_basic(const float* din, float* dout, int num, int chout, int hout,
-                   int wout, int chin, int hin, int win,
-                   const std::vector<int>& ksize,
-                   const std::vector<int>& strides,
-                   const std::vector<int>& paddings, bool global_pooling,
-                   bool exclusive, bool adaptive, bool ceil_mode,
-                   bool use_quantizer, const std::string& pooling_type);
-void pooling_global_max(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win);
-void pooling_global_avg(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win);
-void pooling2x2s2_max(const float* din, float* dout, int num, int chout,
-                      int hout, int wout, int chin, int hin, int win);
-void pooling2x2s2_avg(const float* din, float* dout, int num, int chout,
-                      int hout, int wout, int chin, int hin, int win,
-                      bool exclusive);
-void pooling3x3s1p1_max(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win);
-void pooling3x3s1p1_avg(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win,
-                        bool exclusive);
-void pooling3x3s2p1_max(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win);
-void pooling3x3s2p1_avg(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win,
-                        bool exclusive);
-void pooling3x3s2p0_max(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win);
-void pooling3x3s2p0_avg(const float* din, float* dout, int num, int chout,
-                        int hout, int wout, int chin, int hin, int win,
-                        bool exclusive);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/scale.cc
+++ b/paddle/fluid/lite/arm/math/scale.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/lite/arm/math/scale.h"
-#include "paddle/fluid/lite/arm/math/funcs.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <>
-void scale<float>(const float* din, float* dout, int num, float scale,
-                  float bias) {
-  int cnt = num >> 4;
-  int remain = num % 16;
-  float32x4_t vscale = vdupq_n_f32(scale);
-  float32x4_t vbias = vdupq_n_f32(bias);
-#pragma omp parallel for
-  for (int i = 0; i < cnt; i++) {
-    const float* din_ptr = din + (i << 4);
-    float* dout_ptr = dout + (i << 4);
-    float32x4_t din0 = vld1q_f32(din_ptr);
-    float32x4_t din1 = vld1q_f32(din_ptr + 4);
-    float32x4_t din2 = vld1q_f32(din_ptr + 8);
-    float32x4_t din3 = vld1q_f32(din_ptr + 12);
-    float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
-    float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
-    float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
-    float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
-    vst1q_f32(dout_ptr, vsum1);
-    vst1q_f32(dout_ptr + 4, vsum2);
-    vst1q_f32(dout_ptr + 8, vsum3);
-    vst1q_f32(dout_ptr + 12, vsum4);
-  }
-  if (remain > 0) {
-    const float* din_ptr = din + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
-    for (int i = 0; i < remain; i++) {
-      *dout_ptr = *din_ptr * scale + bias;
-      dout_ptr++;
-      din_ptr++;
-    }
-  }
-}
-template <>
-void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
-                  int inner_dim, const float* scale_data,
-                  const float* bias_data) {
-  int cnt = inner_dim >> 4;
-  int remain = inner_dim % 16;
-  int size = inner_dim * scale_dim;
-  for (int n = 0; n < outer_dim; n++) {
-    const float* din_ptr_n = din + n * size;
-    float* dout_ptr_n = dout + n * size;
-#pragma omp parallel for
-    for (int i = 0; i < scale_dim; i++) {
-      const float* din_ptr = din_ptr_n + i * inner_dim;
-      float* dout_ptr = dout_ptr_n + i * inner_dim;
-      float scale = scale_data[i];
-      float32x4_t vscale = vdupq_n_f32(scale);
-      float bias = bias_data[i];
-      float32x4_t vbias = vdupq_n_f32(bias);
-      for (int j = 0; j < cnt; j++) {
-        float32x4_t din0 = vld1q_f32(din_ptr);
-        float32x4_t din1 = vld1q_f32(din_ptr + 4);
-        float32x4_t din2 = vld1q_f32(din_ptr + 8);
-        float32x4_t din3 = vld1q_f32(din_ptr + 12);
-        float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
-        float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
-        float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
-        float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
-        din_ptr += 16;
-        vst1q_f32(dout_ptr, vsum1);
-        vst1q_f32(dout_ptr + 4, vsum2);
-        vst1q_f32(dout_ptr + 8, vsum3);
-        vst1q_f32(dout_ptr + 12, vsum4);
-        dout_ptr += 16;
-      }
-      for (int j = 0; j < remain; j++) {
-        *dout_ptr = *din_ptr * scale + bias;
-        dout_ptr++;
-        din_ptr++;
-      }
-    }
-  }
-}
-template <>
-void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
-                  const float* scale_data, const float* bias_data) {
-  int cnt = scale_dim >> 4;
-  int remain = scale_dim % 16;
-  for (int n = 0; n < outer_dim; n++) {
-    const float* din_ptr_n = din + n * scale_dim;
-    float* dout_ptr_n = dout + n * scale_dim;
-#pragma omp parallel for
-    for (int i = 0; i < cnt; i++) {
-      int idx = i << 4;
-      const float* din_ptr = din_ptr_n + idx;
-      const float* scale_ptr = scale_data + idx;
-      const float* bias_ptr = bias_data + idx;
-      float* dout_ptr = dout_ptr_n + idx;
-      float32x4_t din0 = vld1q_f32(din_ptr);
-      float32x4_t vscale0 = vld1q_f32(scale_ptr);
-      float32x4_t vbias0 = vld1q_f32(bias_ptr);
-      float32x4_t din1 = vld1q_f32(din_ptr + 4);
-      float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
-      float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
-      float32x4_t din2 = vld1q_f32(din_ptr + 8);
-      float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
-      float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
-      float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
-      float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
-      float32x4_t din3 = vld1q_f32(din_ptr + 12);
-      float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
-      float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
-      vst1q_f32(dout_ptr, vsum1);
-      vst1q_f32(dout_ptr + 4, vsum2);
-      float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
-      float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
-      vst1q_f32(dout_ptr + 8, vsum3);
-      vst1q_f32(dout_ptr + 12, vsum4);
-    }
-    int idx = cnt << 4;
-    const float* din_ptr = din_ptr_n + idx;
-    float* dout_ptr = dout_ptr_n + idx;
-    const float* scale_ptr = scale_data + idx;
-    const float* bias_ptr = bias_data + idx;
-    for (int j = 0; j < remain; j++) {
-      *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
-      dout_ptr++;
-      din_ptr++;
-      scale_ptr++;
-      bias_ptr++;
-    }
-  }
-}
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/scale.h
+++ b/paddle/fluid/lite/arm/math/scale.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <typename T>
-void scale(const T* din, T* dout, int num, float scale, float bias);
-template <typename T>
-void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim,
-           const float* scale_data, const float* bias_data);
-template <typename T>
-void scale(const T* din, T* dout, int outer_dim, int scale_dim,
-           const float* scale_data, const float* bias_data);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/softmax.cc
+++ b/paddle/fluid/lite/arm/math/softmax.cc
--- a/paddle/fluid/lite/arm/math/softmax.h
+++ b/paddle/fluid/lite/arm/math/softmax.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <typename T>
-void softmax_basic(const T* din, T* dout, const int axis_size,
-                   const int inner_num, const int outer_num);
-template <typename T>
-void softmax_inner8_axis4(const T* din, T* dout, const int axis_size,
-                          const int inner_num, const int outer_num);
-template <typename T>
-void softmax_inner4_axis4(const T* din, T* dout, const int axis_size,
-                          const int inner_num, const int outer_num);
-template <typename T>
-void softmax_inner8(const T* din, T* dout, const int axis_size,
-                    const int inner_num, const int outer_num);
-template <typename T>
-void softmax_inner4(const T* din, T* dout, const int axis_size,
-                    const int inner_num, const int outer_num);
-template <typename T>
-void softmax_inner1_large_axis(const T* din, T* dout, const int outer_size,
-                               const int axis_size);
-template <typename T>
-void softmax_inner1_small_axis(const T* din, T* dout, const int outer_size,
-                               const int axis_size);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/split.cc
+++ b/paddle/fluid/lite/arm/math/split.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/lite/arm/math/split.h"
-#include <algorithm>
-#include "paddle/fluid/lite/arm/math/funcs.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <>
-void split_cpy<float>(const float* din, float* dout, int num) {
-  int cnt = num >> 4;
-  int remain = num % 16;
-#pragma omp parallel for
-  for (int i = 0; i < cnt; i++) {
-    const float* din_ptr = din + (i << 4);
-    float* dout_ptr = dout + (i << 4);
-    float32x4_t din0 = vld1q_f32(din_ptr);
-    float32x4_t din1 = vld1q_f32(din_ptr + 4);
-    float32x4_t din2 = vld1q_f32(din_ptr + 8);
-    float32x4_t din3 = vld1q_f32(din_ptr + 12);
-    vst1q_f32(dout_ptr, din0);
-    vst1q_f32(dout_ptr + 4, din1);
-    vst1q_f32(dout_ptr + 8, din2);
-    vst1q_f32(dout_ptr + 12, din3);
-  }
-  if (remain > 0) {
-    const float* din_ptr = din + (cnt << 4);
-    float* dout_ptr = dout + (cnt << 4);
-    for (int i = 0; i < remain; i++) {
-      *dout_ptr = *din_ptr;
-      dout_ptr++;
-      din_ptr++;
-    }
-  }
-}
-template <>
-void split<float>(const float* din, const std::vector<lite::Tensor*>& dout,
-                  const int axis, const std::vector<int>& in_strides) {
-  int input_offset = 0;
-  for (auto out : dout) {
-    auto out_dim = out->dims();
-    std::vector<int> out_strides(out_dim.size());
-    out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1];
-    for (int i = out_dim.size() - 2; i >= 0; --i) {
-      out_strides[i] = out_strides[i + 1] * out_dim[i];
-    }
-    float* out_data = out->mutable_data<float>();
-    int before = out_strides[0] / out_strides[axis];
-    int in_after = in_strides[axis];
-    int out_after = out_strides[axis];
-    for (int i = 0; i < before; ++i) {
-      split_cpy(din + input_offset + i * in_after, out_data + i * out_after,
-                out_after);
-    }
-    input_offset += out_strides[axis];
-  }
-}
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle
--- a/paddle/fluid/lite/arm/math/split.h
+++ b/paddle/fluid/lite/arm/math/split.h
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#pragma once
-#include <vector>
-#include "paddle/fluid/lite/core/op_lite.h"
-namespace paddle {
-namespace lite {
-namespace arm {
-namespace math {
-template <typename T>
-void split_cpy(const T* din, T* dout, int num);
-template <typename T>
-void split(const T* din, const std::vector<lite::Tensor*>& dout, const int axis,
-           const std::vector<int>& in_strides);
-}  // namespace math
-}  // namespace arm
-}  // namespace lite
-}  // namespace paddle