add floor op, elementwise_div op and assign op test=develop (#1882)

26450c49 · huzhiqiang · GitHub · 5e8b15f5 · 26450c49 · 26450c49
24 changed files
--- a/lite/api/_paddle_use_kernels.h
+++ b/lite/api/_paddle_use_kernels.h
@@ -46,6 +46,8 @@ USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
@@ -118,6 +120,7 @@ USE_LITE_KERNEL(while, kARM, kFloat, kNCHW, def)
 USE_LITE_KERNEL(lod_reset, kARM, kFloat, kNCHW, def)
 USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def)
 USE_LITE_KERNEL(is_empty, kARM, kFloat, kNCHW, def)
+USE_LITE_KERNEL(assign, kARM, kFloat, kNCHW, def);
 #endif

 #ifdef LITE_WITH_X86

--- a/lite/api/_paddle_use_ops.h
+++ b/lite/api/_paddle_use_ops.h
@@ -35,9 +35,11 @@ USE_LITE_OP(elementwise_add)
 USE_LITE_OP(elementwise_sub)
 USE_LITE_OP(elementwise_mul)
 USE_LITE_OP(elementwise_max)
+USE_LITE_OP(elementwise_div)
 USE_LITE_OP(fusion_elementwise_add_activation)
 USE_LITE_OP(fusion_elementwise_mul_activation)
 USE_LITE_OP(fusion_elementwise_max_activation)
+USE_LITE_OP(fusion_elementwise_div_activation)
 USE_LITE_OP(square)
 USE_LITE_OP(softmax)
 USE_LITE_OP(dropout)
@@ -68,6 +70,7 @@ USE_LITE_OP(yolo_box)
 USE_LITE_OP(bilinear_interp)
 USE_LITE_OP(nearest_interp)

+USE_LITE_OP(assign);
 USE_LITE_OP(crop)
 USE_LITE_OP(prior_box)
 USE_LITE_OP(density_prior_box)

--- a/lite/arm/math/activation.cc
+++ b/lite/arm/math/activation.cc
@@ -666,6 +666,17 @@ void act_exp(const float* din, float* dout, int size, int threads) {
  }
 }

+template <>
+void act_floor<float>(const float* din, float* dout, int size, int threads) {
+  // Element-wise floor of din[0..size) into dout. `threads` is part of the
+  // shared activation signature and is not used by this implementation.
+  for (int idx = 0; idx < size; ++idx) {
+    dout[idx] = floorf(din[idx]);
+  }
+}
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/lite/arm/math/activation.h
+++ b/lite/arm/math/activation.h
@@ -55,6 +55,9 @@ void act_log(const T* din, T* dout, int size, int threads);
 template <typename T>
 void act_exp(const T* din, T* dout, int size, int threads);

+template <typename T>  // element-wise floor of din[0..size) into dout
+void act_floor(const T* din, T* dout, int size, int threads);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/lite/arm/math/elementwise.cc
+++ b/lite/arm/math/elementwise.cc
@@ -752,6 +752,293 @@ void elementwise_max_relu_broadcast<float>(const float* dinx,
  }
 }

+template <>
+void elementwise_div<float>(const float* dinx,
+                            const float* diny,
+                            float* dout,
+                            int num) {
+  // dout[i] = dinx[i] / diny[i] for i in [0, num). Whole groups of 16 floats
+  // are processed with NEON registers; leftover elements use scalar code.
+  const int block_cnt = num >> 4;
+  const int tail_cnt = num % 16;
+#pragma omp parallel for
+  for (int blk = 0; blk < block_cnt; blk++) {
+    const int base = blk << 4;
+    const float* x_blk = dinx + base;
+    const float* y_blk = diny + base;
+    float* out_blk = dout + base;
+
+    const float32x4_t x0 = vld1q_f32(x_blk);
+    const float32x4_t x1 = vld1q_f32(x_blk + 4);
+    const float32x4_t x2 = vld1q_f32(x_blk + 8);
+    const float32x4_t x3 = vld1q_f32(x_blk + 12);
+
+    const float32x4_t y0 = vld1q_f32(y_blk);
+    const float32x4_t y1 = vld1q_f32(y_blk + 4);
+    const float32x4_t y2 = vld1q_f32(y_blk + 8);
+    const float32x4_t y3 = vld1q_f32(y_blk + 12);
+
+    // AArch64 builds use vdivq_f32; other builds go through div_ps.
+#ifdef __aarch64__
+    const float32x4_t q0 = vdivq_f32(x0, y0);
+    const float32x4_t q1 = vdivq_f32(x1, y1);
+    const float32x4_t q2 = vdivq_f32(x2, y2);
+    const float32x4_t q3 = vdivq_f32(x3, y3);
+#else
+    const float32x4_t q0 = div_ps(x0, y0);
+    const float32x4_t q1 = div_ps(x1, y1);
+    const float32x4_t q2 = div_ps(x2, y2);
+    const float32x4_t q3 = div_ps(x3, y3);
+#endif
+    vst1q_f32(out_blk, q0);
+    vst1q_f32(out_blk + 4, q1);
+    vst1q_f32(out_blk + 8, q2);
+    vst1q_f32(out_blk + 12, q3);
+  }
+  if (tail_cnt > 0) {
+    // Scalar pass over the elements that did not fill a 16-float group.
+    const int tail_begin = block_cnt << 4;
+    for (int t = tail_begin; t < tail_begin + tail_cnt; t++) {
+      dout[t] = dinx[t] / diny[t];
+    }
+  }
+}
+
+template <>
+void elementwise_div_broadcast<float>(const float* dinx,
+                                      const float* diny,
+                                      float* dout,
+                                      int batch,
+                                      int channels,
+                                      int num) {
+  // For every (batch, channel) pair, divides a run of `num` floats of dinx by
+  // the single value diny[channel] and writes the quotients to dout.
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      const int base = (b * channels + c) * num;
+      const float* src = dinx + base;
+      const float divisor = diny[c];
+      float* dst = dout + base;
+
+      int left = num % 16;
+      const float32x4_t vdivisor = vdupq_n_f32(divisor);
+      // Main part: 16 floats per iteration.
+      for (int blk = num >> 4; blk > 0; --blk) {
+        const float32x4_t a0 = vld1q_f32(src);
+        const float32x4_t a1 = vld1q_f32(src + 4);
+        const float32x4_t a2 = vld1q_f32(src + 8);
+        const float32x4_t a3 = vld1q_f32(src + 12);
+
+#ifdef __aarch64__
+        const float32x4_t q0 = vdivq_f32(a0, vdivisor);
+        const float32x4_t q1 = vdivq_f32(a1, vdivisor);
+        const float32x4_t q2 = vdivq_f32(a2, vdivisor);
+        const float32x4_t q3 = vdivq_f32(a3, vdivisor);
+#else
+        const float32x4_t q0 = div_ps(a0, vdivisor);
+        const float32x4_t q1 = div_ps(a1, vdivisor);
+        const float32x4_t q2 = div_ps(a2, vdivisor);
+        const float32x4_t q3 = div_ps(a3, vdivisor);
+#endif
+
+        vst1q_f32(dst, q0);
+        vst1q_f32(dst + 4, q1);
+        vst1q_f32(dst + 8, q2);
+        vst1q_f32(dst + 12, q3);
+        src += 16;
+        dst += 16;
+      }
+      // Tail: one optional step of 8, one optional step of 4, then scalars.
+      if (left >= 8) {
+        const float32x4_t a0 = vld1q_f32(src);
+        const float32x4_t a1 = vld1q_f32(src + 4);
+#ifdef __aarch64__
+        const float32x4_t q0 = vdivq_f32(a0, vdivisor);
+        const float32x4_t q1 = vdivq_f32(a1, vdivisor);
+#else
+        const float32x4_t q0 = div_ps(a0, vdivisor);
+        const float32x4_t q1 = div_ps(a1, vdivisor);
+#endif
+        vst1q_f32(dst, q0);
+        vst1q_f32(dst + 4, q1);
+        src += 8;
+        dst += 8;
+        left -= 8;
+      }
+      if (left >= 4) {
+        const float32x4_t a0 = vld1q_f32(src);
+#ifdef __aarch64__
+        const float32x4_t q0 = vdivq_f32(a0, vdivisor);
+#else
+        const float32x4_t q0 = div_ps(a0, vdivisor);
+#endif
+        vst1q_f32(dst, q0);
+        src += 4;
+        dst += 4;
+        left -= 4;
+      }
+      for (int t = 0; t < left; ++t) {
+        dst[t] = src[t] / divisor;
+      }
+    }
+  }
+}
+
+template <>
+void elementwise_div_relu<float>(const float* dinx,
+                                 const float* diny,
+                                 float* dout,
+                                 int num) {
+  // dout[i] = relu(dinx[i] / diny[i]) for i in [0, num). Whole groups of 16
+  // floats are processed with NEON; leftover elements use scalar code.
+  const int block_cnt = num >> 4;
+  const int tail_cnt = num % 16;
+  const float32x4_t vzero = vdupq_n_f32(0.f);
+#pragma omp parallel for
+  for (int blk = 0; blk < block_cnt; ++blk) {
+    const int base = blk << 4;
+    const float* x_blk = dinx + base;
+    const float* y_blk = diny + base;
+    float* out_blk = dout + base;
+
+    const float32x4_t x0 = vld1q_f32(x_blk);
+    const float32x4_t x1 = vld1q_f32(x_blk + 4);
+    const float32x4_t x2 = vld1q_f32(x_blk + 8);
+    const float32x4_t x3 = vld1q_f32(x_blk + 12);
+
+    const float32x4_t y0 = vld1q_f32(y_blk);
+    const float32x4_t y1 = vld1q_f32(y_blk + 4);
+    const float32x4_t y2 = vld1q_f32(y_blk + 8);
+    const float32x4_t y3 = vld1q_f32(y_blk + 12);
+
+    // AArch64 builds use vdivq_f32; other builds go through div_ps.
+#ifdef __aarch64__
+    const float32x4_t q0 = vdivq_f32(x0, y0);
+    const float32x4_t q1 = vdivq_f32(x1, y1);
+    const float32x4_t q2 = vdivq_f32(x2, y2);
+    const float32x4_t q3 = vdivq_f32(x3, y3);
+#else
+    const float32x4_t q0 = div_ps(x0, y0);
+    const float32x4_t q1 = div_ps(x1, y1);
+    const float32x4_t q2 = div_ps(x2, y2);
+    const float32x4_t q3 = div_ps(x3, y3);
+#endif
+    // relu: clamp the quotients at zero before storing.
+    vst1q_f32(out_blk, vmaxq_f32(q0, vzero));
+    vst1q_f32(out_blk + 4, vmaxq_f32(q1, vzero));
+    vst1q_f32(out_blk + 8, vmaxq_f32(q2, vzero));
+    vst1q_f32(out_blk + 12, vmaxq_f32(q3, vzero));
+  }
+  if (tail_cnt > 0) {
+    // Scalar pass over the elements that did not fill a 16-float group.
+    const int tail_begin = block_cnt << 4;
+    for (int t = tail_begin; t < tail_begin + tail_cnt; ++t) {
+      const float quotient = dinx[t] / diny[t];
+      dout[t] = quotient > 0.f ? quotient : 0.f;
+    }
+  }
+}
+
+template <>
+void elementwise_div_relu_broadcast<float>(const float* dinx,
+                                           const float* diny,
+                                           float* dout,
+                                           int batch,
+                                           int channels,
+                                           int num) {
+  // For every (batch, channel) pair, divides a run of `num` floats of dinx by
+  // the single value diny[channel], applies relu, and writes to dout.
+  const float32x4_t vzero = vdupq_n_f32(0.f);
+#pragma omp parallel for collapse(2)
+  for (int b = 0; b < batch; ++b) {
+    for (int c = 0; c < channels; ++c) {
+      const int base = (b * channels + c) * num;
+      const float* src = dinx + base;
+      const float divisor = diny[c];
+      float* dst = dout + base;
+
+      int left = num % 16;
+      const float32x4_t vdivisor = vdupq_n_f32(divisor);
+      // Main part: 16 floats per iteration.
+      for (int blk = num >> 4; blk > 0; --blk) {
+        const float32x4_t a0 = vld1q_f32(src);
+        const float32x4_t a1 = vld1q_f32(src + 4);
+        const float32x4_t a2 = vld1q_f32(src + 8);
+        const float32x4_t a3 = vld1q_f32(src + 12);
+
+#ifdef __aarch64__
+        const float32x4_t q0 = vdivq_f32(a0, vdivisor);
+        const float32x4_t q1 = vdivq_f32(a1, vdivisor);
+        const float32x4_t q2 = vdivq_f32(a2, vdivisor);
+        const float32x4_t q3 = vdivq_f32(a3, vdivisor);
+#else
+        const float32x4_t q0 = div_ps(a0, vdivisor);
+        const float32x4_t q1 = div_ps(a1, vdivisor);
+        const float32x4_t q2 = div_ps(a2, vdivisor);
+        const float32x4_t q3 = div_ps(a3, vdivisor);
+#endif
+        // relu: clamp the quotients at zero before storing.
+        vst1q_f32(dst, vmaxq_f32(q0, vzero));
+        vst1q_f32(dst + 4, vmaxq_f32(q1, vzero));
+        vst1q_f32(dst + 8, vmaxq_f32(q2, vzero));
+        vst1q_f32(dst + 12, vmaxq_f32(q3, vzero));
+        src += 16;
+        dst += 16;
+      }
+      // Tail: one optional step of 8, one optional step of 4, then scalars.
+      if (left >= 8) {
+        const float32x4_t a0 = vld1q_f32(src);
+        const float32x4_t a1 = vld1q_f32(src + 4);
+#ifdef __aarch64__
+        const float32x4_t q0 = vdivq_f32(a0, vdivisor);
+        const float32x4_t q1 = vdivq_f32(a1, vdivisor);
+#else
+        const float32x4_t q0 = div_ps(a0, vdivisor);
+        const float32x4_t q1 = div_ps(a1, vdivisor);
+#endif
+        vst1q_f32(dst, vmaxq_f32(q0, vzero));
+        vst1q_f32(dst + 4, vmaxq_f32(q1, vzero));
+        src += 8;
+        dst += 8;
+        left -= 8;
+      }
+      if (left >= 4) {
+        const float32x4_t a0 = vld1q_f32(src);
+#ifdef __aarch64__
+        const float32x4_t q0 = vdivq_f32(a0, vdivisor);
+#else
+        const float32x4_t q0 = div_ps(a0, vdivisor);
+#endif
+        vst1q_f32(dst, vmaxq_f32(q0, vzero));
+        src += 4;
+        dst += 4;
+        left -= 4;
+      }
+      for (int t = 0; t < left; ++t) {
+        const float quotient = src[t] / divisor;
+        dst[t] = quotient > 0.f ? quotient : 0.f;
+      }
+    }
+  }
+}
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/lite/arm/math/elementwise.h
+++ b/lite/arm/math/elementwise.h
@@ -61,6 +61,20 @@ template <typename T>
 void elementwise_max_relu_broadcast(
    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);

+template <typename T>  // dout[i] = dinx[i] / diny[i] for i in [0, num)
+void elementwise_div(const T* dinx, const T* diny, T* dout, int num);
+
+template <typename T>  // divides each run of num values by diny[channel]
+void elementwise_div_broadcast(
+    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+
+template <typename T>  // elementwise_div with the result clamped at zero
+void elementwise_div_relu(const T* dinx, const T* diny, T* dout, int num);
+
+template <typename T>  // elementwise_div_broadcast clamped at zero
+void elementwise_div_relu_broadcast(
+    const T* dinx, const T* diny, T* dout, int batch, int channels, int num);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite

--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -45,6 +45,7 @@ add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${li
 add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(assign_compute_arm ARM basic SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) 

 # for OCR specific
 add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)

--- a/lite/kernels/arm/activation_compute.cc
+++ b/lite/kernels/arm/activation_compute.cc
@@ -137,6 +137,16 @@ void ExpCompute::Run() {
      x_data, output_data, x_dims.production(), ctx.threads());
 }

+void FloorCompute::Run() {
+  // Feeds every element of X through act_floor and stores the result in Out.
+  auto& act_param = this->Param<param_t>();
+  auto& arm_ctx = this->ctx_->template As<ARMContext>();
+  auto in_dims = act_param.X->dims();
+  auto src = act_param.X->data<float>();
+  auto dst = act_param.Out->mutable_data<float>();
+  lite::arm::math::act_floor<float>(
+      src, dst, in_dims.production(), arm_ctx.threads());
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -209,3 +219,8 @@ REGISTER_LITE_KERNEL(
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
+REGISTER_LITE_KERNEL(  // op "floor" -> FloorCompute (kARM, kFloat, kNCHW)
+    floor, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::FloorCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})  // source tensor
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})  // floor(X)
+    .Finalize();
--- a/lite/kernels/arm/activation_compute.h
+++ b/lite/kernels/arm/activation_compute.h
@@ -112,6 +112,15 @@ class ExpCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
  virtual ~ExpCompute() = default;
 };

+// ARM float kernel for the "floor" activation; the work happens in Run().
+class FloorCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::ActivationParam;
+
+  virtual ~FloorCompute() = default;
+
+  void Run() override;
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

--- a/lite/kernels/arm/assign_compute.cc
+++ b/lite/kernels/arm/assign_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/arm/assign_compute.h"
+#include <vector>
+#include "lite/arm/math/funcs.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+void AssignCompute::PrepareForRun() {
+  // Nothing to set up ahead of Run().
+}
+
+void AssignCompute::Run() {
+  // Hands X to Out through Tensor::CopyDataFrom.
+  auto& assign_param = Param<param_t>();
+  const lite::Tensor* src = assign_param.X;
+  lite::Tensor* dst = assign_param.Out;
+  dst->CopyDataFrom(*src);
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(  // op "assign" -> AssignCompute (kARM, kFloat, kNCHW)
+    assign, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AssignCompute, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})  // tensor to copy
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})  // the copy
+    .Finalize();
--- a/lite/kernels/arm/assign_compute.h
+++ b/lite/kernels/arm/assign_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include "lite/core/kernel.h"
+#include "lite/operators/assign_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+// ARM float kernel for the "assign" op, parameterized by AssignParam.
+class AssignCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::AssignParam;
+
+  virtual ~AssignCompute() = default;
+
+  void PrepareForRun() override;
+  void Run() override;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/arm/elementwise_compute.cc
+++ b/lite/kernels/arm/elementwise_compute.cc
@@ -205,6 +205,55 @@ void ElementwiseMaxActivationCompute::Run() {
  }
 }

+void ElementwiseDivCompute::Run() {
+  // Out = X / Y, choosing the broadcast kernel when is_broadcast() says the
+  // shapes require it and the plain element-wise kernel otherwise.
+  auto& param = Param<operators::ElementwiseParam>();
+  const float* lhs = param.X->data<float>();
+  const float* rhs = param.Y->data<float>();
+  float* result = param.Out->mutable_data<float>();
+  int axis = param.axis;
+  auto x_shape = param.X->dims();
+  auto y_shape = param.Y->dims();
+  int outer, mid, inner;
+  if (!is_broadcast(x_shape, y_shape, axis, &outer, &mid, &inner)) {
+    lite::arm::math::elementwise_div(
+        lhs, rhs, result, x_shape.production());
+    return;
+  }
+  lite::arm::math::elementwise_div_broadcast(
+      lhs, rhs, result, outer, mid, inner);
+}
+
+void ElementwiseDivActivationCompute::Run() {
+  // Fused "X / Y, then activation" kernel. Only the "relu" activation is
+  // implemented; any other act_type is reported through LOG(FATAL).
+  auto& param = Param<operators::FusionElementwiseActivationParam>();
+  const float* x_data = param.X->data<float>();
+  const float* y_data = param.Y->data<float>();
+  float* out_data = param.Out->mutable_data<float>();
+  int axis = param.axis;
+  std::string act_type = param.act_type;
+  auto x_dims = param.X->dims();
+  auto y_dims = param.Y->dims();
+  int pre, n, post;
+  if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_div_relu_broadcast(
+          x_data, y_data, out_data, pre, n, post);
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  } else {
+    if (act_type == "relu") {
+      lite::arm::math::elementwise_div_relu(
+          x_data, y_data, out_data, x_dims.production());
+    } else {
+      LOG(FATAL) << "unsupported Activation type: " << act_type;
+    }
+  }
+  // No per-element dump of x/y/out here: the broadcast path reads Y only at
+  // indices [0, n), so Y must not be indexed by X's element count.
+}
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite
@@ -278,3 +327,26 @@ REGISTER_LITE_KERNEL(
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
+
+REGISTER_LITE_KERNEL(elementwise_div,  // plain division kernel
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::ElementwiseDivCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})  // dividend
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})  // divisor
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})  // X / Y
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    fusion_elementwise_div_activation,  // division fused with an activation
+    kARM,
+    kFloat,
+    kNCHW,
+    paddle::lite::kernels::arm::ElementwiseDivActivationCompute,
+    def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})  // dividend
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})  // divisor
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})  // act(X / Y)
+    .Finalize();
--- a/lite/kernels/arm/elementwise_compute.h
+++ b/lite/kernels/arm/elementwise_compute.h
@@ -70,6 +70,22 @@ class ElementwiseMaxActivationCompute
  virtual ~ElementwiseMaxActivationCompute() = default;
 };

+// ARM float kernel computing Out = X / Y (see ElementwiseDivCompute::Run).
+class ElementwiseDivCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  virtual ~ElementwiseDivCompute() = default;
+
+  void Run() override;
+};
+
+// ARM float kernel computing Out = act(X / Y) for the fused div+activation op.
+class ElementwiseDivActivationCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  virtual ~ElementwiseDivActivationCompute() = default;
+
+  void Run() override;
+};
+
 }  // namespace arm
 }  // namespace kernels
 }  // namespace lite

--- a/lite/operators/CMakeLists.txt
+++ b/lite/operators/CMakeLists.txt
@@ -59,6 +59,7 @@ add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS})
 add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS})
 add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS})
 add_operator(im2sequence_op basic SRCS im2sequence_op.cc DEPS ${op_DEPS})
+add_operator(assign_op basic SRCS assign_op.cc DEPS ${op_DEPS})

 # for OCR specific
 add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS})

--- a/lite/operators/activation_ops.cc
+++ b/lite/operators/activation_ops.cc
@@ -110,6 +110,7 @@ REGISTER_LITE_OP(swish, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(relu6, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp);
 REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp);
+REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp);

 #ifdef LITE_WITH_TRAIN
 REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp);

--- a/lite/operators/assign_op.cc
+++ b/lite/operators/assign_op.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/assign_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool AssignOpLite::CheckShape() const {  // requires X and Out to be bound
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
+  return true;  // reached once both checks above have passed
+}
+
+bool AssignOpLite::InferShape() const {
+  // Out is resized to the dims of X.
+  param_.Out->Resize(lite::DDim(param_.X->dims()));
+  return true;
+}
+
+// TODO(Superjomn) replace framework::OpDesc with a lite one.
+bool AssignOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  // Binds param_.X / param_.Out to the scope variables named by the first
+  // "X" input and the first "Out" output of the op description.
+  auto input = op_desc.Input("X").front();
+  auto out = op_desc.Output("Out").front();
+
+  // Look each variable up once and CHECK it before use, so a name missing
+  // from the scope fails the check instead of dereferencing a null result.
+  auto input_var = scope->FindVar(input);
+  CHECK(input_var);
+  auto out_var = scope->FindVar(out);
+  CHECK(out_var);
+
+  param_.X = input_var->GetMutable<lite::Tensor>();
+  param_.Out = out_var->GetMutable<lite::Tensor>();
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(assign, paddle::lite::operators::AssignOpLite);
--- a/lite/operators/assign_op.h
+++ b/lite/operators/assign_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+// Operator definition for "assign". It owns an AssignParam (X, Out) and
+// passes it to the kernel in AttachKernel.
+class AssignOpLite : public OpLite {
+ public:
+  AssignOpLite() {}
+  explicit AssignOpLite(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+  bool InferShape() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  std::string DebugString() const override {
+    return "assign";
+  }
+
+  void AttachKernel(KernelBase *kernel) override {
+    kernel->SetParam(param_);
+  }
+
+ private:
+  // Filled in by AttachImpl.
+  mutable AssignParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
--- a/lite/operators/elementwise_ops.cc
+++ b/lite/operators/elementwise_ops.cc
@@ -89,6 +89,7 @@ REGISTER_LITE_OP(elementwise_add, paddle::lite::operators::ElementwiseOp);

 REGISTER_LITE_OP(elementwise_mul, paddle::lite::operators::ElementwiseOp);
 REGISTER_LITE_OP(elementwise_max, paddle::lite::operators::ElementwiseOp);
+REGISTER_LITE_OP(elementwise_div, paddle::lite::operators::ElementwiseOp);

 #ifdef LITE_WITH_TRAIN
 REGISTER_LITE_OP(elementwise_sub_grad,

--- a/lite/operators/fusion_elementwise_activation_ops.cc
+++ b/lite/operators/fusion_elementwise_activation_ops.cc
@@ -97,6 +97,8 @@ REGISTER_LITE_OP(fusion_elementwise_mul_activation,
                 paddle::lite::operators::FusionElementwiseActivationOp);
 REGISTER_LITE_OP(fusion_elementwise_max_activation,
                 paddle::lite::operators::FusionElementwiseActivationOp);
+REGISTER_LITE_OP(fusion_elementwise_div_activation,
+                 paddle::lite::operators::FusionElementwiseActivationOp);

 #ifdef LITE_WITH_TRAIN
 REGISTER_LITE_OP(

--- a/lite/operators/op_params.h
+++ b/lite/operators/op_params.h
@@ -719,6 +719,12 @@ struct MatMulParam {
  bool transpose_Y{false};
  float alpha{1.0f};
 };
+
+/// ----------------------- assign operators -----------------------
+struct AssignParam {
+  const lite::Tensor* X{};  // source tensor, read-only
+  lite::Tensor* Out{};      // destination tensor
+};
 }  // namespace operators
 }  // namespace lite
 }  // namespace paddle
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -13,6 +13,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/kernels/activation_compute_test.cc
+++ b/lite/tests/kernels/activation_compute_test.cc
@@ -32,7 +32,8 @@ enum activation_type_test {
  SWISH,
  RELU6,
  LOG,
-  EXP
+  EXP,
+  FLOOR
 };

 class ActivationComputeTester : public arena::TestCase {
@@ -170,6 +171,12 @@ class ActivationComputeTester : public arena::TestCase {
        }
        break;
      }
+      case FLOOR: {
+        for (int i = 0; i < dims_.production(); i++) {
+          output_data[i] = std::floor(x_data[i]);
+        }
+        break;
+      }
      default:
        LOG(INFO) << "the type of activation is unknow.";
    }
@@ -519,5 +526,32 @@ TEST(Activation_exp, precision) {
 #endif
 }

+TEST(Activation_floor, precision) {  // floor kernel vs. std::floor baseline
+  LOG(INFO) << "test floor op";
+#ifdef LITE_WITH_ARM  // the body is empty on builds without ARM support
+  Place place(TARGET(kARM));
+  for (auto n : {1, 3}) {  // sweep over N, C, H, W sizes
+    for (auto c : {3, 6}) {
+      for (auto h : {9, 18}) {
+        for (auto w : {9, 18}) {
+          std::unique_ptr<arena::TestCase> tester(new ActivationComputeTester(
+              place,
+              "def",
+              0.01,  // NOTE(review): 0.01, 6., "all", 0. look like attributes
+              6.,    // of other activation types, presumably unused by floor
+              "all",  // -- confirm against ActivationComputeTester's ctor
+              0.,
+              DDim(std::vector<int64_t>({n, c, h, w})),
+              "floor",
+              FLOOR));
+          arena::Arena arena(std::move(tester), place, 2e-5);  // NOTE(review): 2e-5 is presumably the tolerance -- confirm
+          arena.TestPrecision();
+        }
+      }
+    }
+  }
+#endif
+}
+
 }  // namespace lite
 }  // namespace paddle
--- a/lite/tests/kernels/assign_compute_test.cc
+++ b/lite/tests/kernels/assign_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+
+namespace paddle {
+namespace lite {
+
+// Arena test case for the "assign" op on a fixed 100 x 20 float tensor.
+class AssignComputeTester : public arena::TestCase {
+ protected:
+  std::string input_ = "X";
+  std::string output_ = "Out";
+  DDim dims_{{100, 20}};
+
+ public:
+  AssignComputeTester(const Place& place, const std::string& alias)
+      : TestCase(place, alias) {}
+
+  // Reference result: Out is an element-by-element copy of X.
+  void RunBaseline(Scope* scope) override {
+    auto* dst_tensor = scope->NewTensor(output_);
+    CHECK(dst_tensor);
+    dst_tensor->Resize(dims_);
+    auto* dst = dst_tensor->mutable_data<float>();
+    auto* src_tensor = scope->FindTensor(input_);
+    const auto* src = src_tensor->data<float>();
+    const auto total = dims_.production();
+    for (int idx = 0; idx < total; ++idx) {
+      dst[idx] = src[idx];
+    }
+  }
+
+  // Describes an assign op reading input_ and writing output_.
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("assign");
+    op_desc->SetInput("X", {input_});
+    op_desc->SetOutput("Out", {output_});
+  }
+
+  // Fills X with the ramp 0, 1.1, 2.2, ...
+  void PrepareData() override {
+    const auto total = dims_.production();
+    std::vector<float> values(total);
+    for (int idx = 0; idx < total; ++idx) {
+      values[idx] = idx * 1.1;
+    }
+    SetCommonTensor(input_, dims_, values.data());
+  }
+};
+
+void TestAssign(const Place& place) {
+  // Builds an AssignComputeTester for the "def" kernel alias and runs the
+  // arena precision check on it; 2e-5 is forwarded to arena::Arena.
+  auto* raw_case = new AssignComputeTester(place, "def");
+  std::unique_ptr<arena::TestCase> test_case(raw_case);
+  arena::Arena runner(std::move(test_case), place, 2e-5);
+  runner.TestPrecision();
+}
+
+TEST(Assign, precision) {
+  // The assign test only runs on ARM builds; on other builds the body is
+  // empty. Declaring `place` in a single branch keeps the test compiling
+  // when several LITE_WITH_* targets are enabled together.
+#ifdef LITE_WITH_ARM
+  Place place(TARGET(kARM));
+  TestAssign(place);
+#endif
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/tests/kernels/elementwise_compute_test.cc
+++ b/lite/tests/kernels/elementwise_compute_test.cc
@@ -350,6 +350,125 @@ class FusionElementwiseMaxActivationComputeTester : public arena::TestCase {
  }
 };

+// Arena test case for the "elementwise_div" op on two equally shaped float
+// tensors; the reference result is the plain per-element quotient x / y.
+class ElementwiseDivComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string inputx_ = "x";
+  std::string inputy_ = "y";
+  std::string output_ = "out";
+  // Forwarded to the op as its "axis" attribute; the baseline ignores it.
+  int axis_;
+  DDim dims_{{1, 2, 3, 4}};
+
+ public:
+  ElementwiseDivComputeTester(const Place& place,
+                              const std::string& alias,
+                              int axis)
+      : TestCase(place, alias), axis_(axis) {}
+
+  // Produces the reference output: creates the output tensor in `scope`,
+  // sizes it to dims_, and stores x[i] / y[i] for every element.
+  void RunBaseline(Scope* scope) override {
+    auto* expected = scope->NewTensor(output_);
+    CHECK(expected);
+    expected->Resize(dims_);
+    auto* quotient = expected->mutable_data<float>();
+
+    auto* lhs = scope->FindTensor(inputx_);
+    const auto* numerator = lhs->data<float>();
+    auto* rhs = scope->FindTensor(inputy_);
+    const auto* denominator = rhs->data<float>();
+
+    const auto count = dims_.production();
+    for (int idx = 0; idx < count; ++idx) {
+      quotient[idx] = numerator[idx] / denominator[idx];
+    }
+  }
+
+  // Describes the op under test: type, both inputs, the output and the axis.
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("elementwise_div");
+    op_desc->SetInput("X", {inputx_});
+    op_desc->SetInput("Y", {inputy_});
+    op_desc->SetOutput("Out", {output_});
+    op_desc->SetAttr("axis", axis_);
+  }
+
+  // Registers the two inputs.  x is the ramp 0, 1.1, 2.2, ... and y is the
+  // same ramp shifted by one step, so the divisor starts at 1.1 and is never
+  // zero.
+  void PrepareData() override {
+    const auto count = dims_.production();
+    std::vector<float> numerators(count);
+    std::vector<float> denominators(count);
+
+    for (int idx = 0; idx < count; ++idx) {
+      numerators[idx] = idx * 1.1;
+      denominators[idx] = (idx + 1) * 1.1;
+    }
+
+    SetCommonTensor(inputx_, dims_, numerators.data());
+    SetCommonTensor(inputy_, dims_, denominators.data());
+  }
+};
+
+// Arena test case for the "fusion_elementwise_div_activation" op on two
+// equally shaped float tensors.  The reference result is the per-element
+// quotient x / y followed by the activation named by act_type_; only "relu"
+// is supported by the baseline.
+class FusionElementwiseDivActivationComputeTester : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string inputx_ = "x";
+  std::string inputy_ = "y";
+  std::string output_ = "out";
+  // Forwarded to the op as its "axis" attribute; the baseline ignores it.
+  int axis_;
+  // Forwarded to the op as its "act_type" attribute.
+  std::string act_type_;
+  DDim dims_{{1, 2, 3, 4}};
+
+ public:
+  FusionElementwiseDivActivationComputeTester(const Place& place,
+                                              const std::string& alias,
+                                              int axis,
+                                              std::string act_type)
+      : TestCase(place, alias),
+        axis_(axis),
+        act_type_(std::move(act_type)) {}
+
+  // Produces the reference output: creates the output tensor in `scope`,
+  // sizes it to dims_, and stores relu(x[i] / y[i]) for every element.
+  // Aborts through LOG(FATAL) when act_type_ is anything other than "relu".
+  void RunBaseline(Scope* scope) override {
+    // The activation type is fixed for the whole run, so validate it once
+    // instead of re-checking it for every element.
+    if (act_type_ != "relu") {
+      LOG(FATAL) << "unsupported Activation type: " << act_type_;
+    }
+
+    auto* out = scope->NewTensor(output_);
+    CHECK(out);
+    out->Resize(dims_);
+    auto* out_data = out->mutable_data<float>();
+
+    auto* x = scope->FindTensor(inputx_);
+    const auto* x_data = x->data<float>();
+    auto* y = scope->FindTensor(inputy_);
+    const auto* y_data = y->data<float>();
+
+    const auto total = dims_.production();
+    for (int i = 0; i < total; i++) {
+      const float quotient = x_data[i] / y_data[i];
+      // relu: keep positive values, clamp everything else to zero.
+      out_data[i] = quotient > 0 ? quotient : 0;
+    }
+  }
+
+  // Describes the op under test: type, both inputs, the output, the axis and
+  // the activation type.
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {
+    op_desc->SetType("fusion_elementwise_div_activation");
+    op_desc->SetInput("X", {inputx_});
+    op_desc->SetInput("Y", {inputy_});
+    op_desc->SetOutput("Out", {output_});
+    op_desc->SetAttr("axis", axis_);
+    op_desc->SetAttr("act_type", act_type_);
+  }
+
+  // Registers the two inputs.  x is the ramp 0, 1.1, 2.2, ... and y is the
+  // same ramp shifted by one step, so the divisor starts at 1.1 and is never
+  // zero.
+  void PrepareData() override {
+    std::vector<float> data(dims_.production());
+
+    for (int i = 0; i < dims_.production(); i++) {
+      data[i] = i * 1.1;
+    }
+    std::vector<float> data2(dims_.production());
+    for (int i = 0; i < dims_.production(); i++) {
+      data2[i] = (i + 1) * 1.1;
+    }
+    SetCommonTensor(inputx_, dims_, data.data());
+    SetCommonTensor(inputy_, dims_, data2.data());
+  }
+};
+
 void test_elementwise(Place place) {
  for (int axis : {-1, 0, 1, 3}) {
    std::unique_ptr<arena::TestCase> tester(
@@ -366,6 +485,11 @@ void test_elementwise(Place place) {
        new ElementwiseMaxComputeTester(place, "def", axis));
    arena::Arena arena_max(std::move(tester_max), place, 2e-5);
    arena_max.TestPrecision();
+
+    std::unique_ptr<arena::TestCase> tester_div(
+        new ElementwiseDivComputeTester(place, "def", axis));
+    arena::Arena arena_div(std::move(tester_div), place, 2e-5);
+    arena_div.TestPrecision();
  }
 }

@@ -398,6 +522,12 @@ void test_fusion_elementwise(Place place) {
            place, "def", axis, "relu"));
    arena::Arena arena_max_act(std::move(tester_max_act), place, 2e-5);
    arena_max_act.TestPrecision();
+
+    std::unique_ptr<arena::TestCase> tester_div_act(
+        new FusionElementwiseDivActivationComputeTester(
+            place, "def", axis, "relu"));
+    arena::Arena arena_div_act(std::move(tester_div_act), place, 2e-5);
+    arena_div_act.TestPrecision();
  }
 }