From 09a07a23903ecf9e61bd8bd103d82d93bd5fb314 Mon Sep 17 00:00:00 2001 From: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 19 Jun 2019 16:11:27 +0000 Subject: [PATCH] add calib kernel --- paddle/fluid/lite/arm/math/CMakeLists.txt | 2 +- paddle/fluid/lite/arm/math/type_trans.cpp | 579 ------------------ paddle/fluid/lite/kernels/arm/CMakeLists.txt | 2 + .../fluid/lite/kernels/arm/calib_compute.cc | 57 ++ paddle/fluid/lite/kernels/arm/calib_compute.h | 40 ++ .../lite/kernels/arm/calib_compute_test.cc | 149 +++++ paddle/fluid/lite/operators/CMakeLists.txt | 3 + paddle/fluid/lite/operators/calib_op.cc | 56 ++ paddle/fluid/lite/operators/calib_op.h | 60 ++ paddle/fluid/lite/operators/calib_op_test.cc | 64 ++ paddle/fluid/lite/operators/op_params.h | 8 + 11 files changed, 440 insertions(+), 580 deletions(-) delete mode 100644 paddle/fluid/lite/arm/math/type_trans.cpp create mode 100644 paddle/fluid/lite/kernels/arm/calib_compute.cc create mode 100644 paddle/fluid/lite/kernels/arm/calib_compute.h create mode 100644 paddle/fluid/lite/kernels/arm/calib_compute_test.cc create mode 100644 paddle/fluid/lite/operators/calib_op.cc create mode 100644 paddle/fluid/lite/operators/calib_op.h create mode 100644 paddle/fluid/lite/operators/calib_op_test.cc diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt index 883e7bc4609..dd439bbf0f6 100644 --- a/paddle/fluid/lite/arm/math/CMakeLists.txt +++ b/paddle/fluid/lite/arm/math/CMakeLists.txt @@ -16,7 +16,7 @@ cc_library(math_arm SRCS elementwise.cc concat.cc sgemv.cc - type_trans.cpp + type_trans.cc conv_impl.cc conv_direct_3x3s1.cc conv_direct_3x3s2.cc diff --git a/paddle/fluid/lite/arm/math/type_trans.cpp b/paddle/fluid/lite/arm/math/type_trans.cpp deleted file mode 100644 index f9c3ea590f3..00000000000 --- a/paddle/fluid/lite/arm/math/type_trans.cpp +++ /dev/null @@ -1,579 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/lite/arm/math/type_trans.h" -#include -#include -#include "paddle/fluid/lite/arm/math/saturate.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -template -void int32_to_dtype(const int* din, dtype* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size); - -void fp32_to_int8(const float* din, signed char* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = outer_size * axis_size; - -#pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - signed char* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - signed char* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "fmul v6.4s, v2.4s, %[scale].4s \n" - "fmul v7.4s, v3.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" 
- "FCVTAS v9.4s, v5.4s \n" - "FCVTAS v10.4s, v6.4s \n" - "FCVTAS v11.4s, v7.4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "sqxtn v5.4h, v10.4s \n" - "sqxtn2 v5.8h, v11.4s \n" - "sqxtn v8.8b, v4.8h \n" - "sqxtn2 v8.16b, v5.8h \n" - "str q8, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" - "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vcvt.s32.f32 q2, q6 @ cvt to int32\n" - "vcvt.s32.f32 q3, q7 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vqmovn.s32 d10, q2 @ cnt to int16\n" - "vqmovn.s32 d11, q3 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vqmovn.s16 d12, q4 @ cnt to int8\n" - "vqmovn.s16 d13, q5 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d12-d13}, [%[dout]]! 
@ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11"); -#endif - } - const float* din_r = din_c + 16 * cnt; - signed char* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } - } -} - -void fp32_to_int16(const float* din, int16_t* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int cnt = inner_size / 8; - int remain = inner_size & 7; - int64_t loop_size = outer_size * axis_size; - -#pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - int16_t* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - int16_t* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "str q4, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) - : [scale] "w"(vscale) - : "v0", "v1", "v4", "v5", "v8", "v9"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), - [vzero] "w"(vzero) - : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); -#endif - } - const float* din_r = din_c + 8 * cnt; - int16_t* dout_r = dout_c + 8 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } - } -} - -void int8_to_fp32(const signed char* in, float* out, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const signed char* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const signed char* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - "0: \n" /* main loop */ - "sshll v2.8h, v0.8b, #0 \n" /* 
trans to int16*/ - "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ - - "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ - - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" - "0: @ main loop\n" - "vmovl.s8 q2, d0 @ trans to int16\n" - "vmovl.s8 q3, d1 @ trans to int16\n" - "vmovl.s16 q4, d4 @ trans to int32\n" - "vmovl.s16 q5, d5 @ trans to int32\n" - "vmovl.s16 q6, d6 @ trans to int32\n" - "vmovl.s16 q7, d7 @ trans to int32\n" - "vcvt.f32.s32 q0, q4 @ trans to fp32\n" - "vcvt.f32.s32 q1, q5 @ trans to fp32\n" - "vcvt.f32.s32 q2, q6 @ trans to fp32\n" - "vcvt.f32.s32 q3, q7 @ trans to fp32\n" - "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" - "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" - "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" - "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" - - "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" - - "subs %[loop], #1 \n" - - "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" - "vst1.f32 {d12-d15}, [%[out]]! 
@ write to memory\n" - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - const signed char* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int16_to_fp32(const int16_t* in, float* out, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int16_t* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const int16_t* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ - "0: \n" /* main loop */ - "sshll v4.4s, v0.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v0.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v1.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v1.8h, #0 \n" /* trans to int32*/ - - "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", 
"v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); -#else - asm volatile( - "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" - "0: @ main loop\n" - "vmovl.s16 q4, d0 @ trans to int32\n" - "vmovl.s16 q5, d1 @ trans to int32\n" - "vmovl.s16 q6, d2 @ trans to int32\n" - "vmovl.s16 q7, d3 @ trans to int32\n" - "vcvt.f32.s32 q0, q4 @ trans to fp32\n" - "vcvt.f32.s32 q1, q5 @ trans to fp32\n" - "vcvt.f32.s32 q2, q6 @ trans to fp32\n" - "vcvt.f32.s32 q3, q7 @ trans to fp32\n" - "vmul.f32 q4, q0, %q[scale] @ mul with scale\n" - "vmul.f32 q5, q1, %q[scale] @ mul with scale\n" - "vmul.f32 q6, q2, %q[scale] @ mul with scale\n" - "vmul.f32 q7, q3, %q[scale] @ mul with scale\n" - - "vld1.32 {d0-d3}, [%[in]]! @ load 16 int8\n" - - "subs %[loop], #1 \n" - - "vst1.f32 {d8-d11}, [%[out]]! @ write to memory\n" - "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" - - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); -#endif // __aarch64__ - } - const int16_t* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int32_to_fp32(const int* din, float* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = axis_size * outer_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int* din_c = din + n * inner_size; - float* dout_c = dout + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const int* din_ptr = din_c; - float* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" - "scvtf v4.4s, v0.4s \n" - "scvtf v5.4s, v1.4s \n" - "scvtf v6.4s, v2.4s \n" - "scvtf v7.4s, v3.4s \n" - "ldp 
q0, q1, [%[in]], #32 \n" - "fmul v8.4s, v4.4s, %[scale].4s \n" - "fmul v9.4s, v5.4s, %[scale].4s \n" - "fmul v10.4s, v6.4s, %[scale].4s \n" - "fmul v11.4s, v7.4s, %[scale].4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "stp q8, q9, [%[out]], #32 \n" - "stp q10, q11, [%[out]], #32 \n" - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", - "v11"); -#else - asm volatile( - "vld1.s32 {d0-d3}, [%[in]]! \n" - "vld1.s32 {d4-d7}, [%[in]]! \n" - "0: \n" - "vcvt.f32.s32 q4, q0 \n" - "vcvt.f32.s32 q5, q1 \n" - "vcvt.f32.s32 q6, q2 \n" - "vcvt.f32.s32 q7, q3 \n" - "vld1.s32 {d0-d3}, [%[in]]! \n" - "vmul.f32 q8, q4, %q[scale] \n" - "vmul.f32 q9, q5, %q[scale] \n" - "vmul.f32 q10, q6, %q[scale] \n" - "vmul.f32 q11, q7, %q[scale] \n" - "vld1.s32 {d4-d7}, [%[in]]! \n" - "subs %[loop], #1 \n" - "vst1.f32 {d16-d19}, [%[out]]! \n" - "vst1.f32 {d20-d23}, [%[out]]! 
\n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11"); -#endif // __aarch64__ - } - const int* din_r = din_c + 16 * cnt; - float* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = in_scale * din_r[i]; - } - } -} - -void int32_to_int8(const int* din, signed char* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int cnt = inner_size / 16; - int remain = inner_size & 15; - int64_t loop_size = outer_size * axis_size; -#pragma omp parallel for - for (int64_t n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const int* din_c = din + n * inner_size; - signed char* dout_c = dout + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - if (cnt > 0) { - int loop = cnt; - const int* din_ptr = din_c; - signed char* dout_ptr = dout_c; -#ifdef __aarch64__ - asm volatile( - "0: \n" - "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" - "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" - - "scvtf v4.4s, v0.4s \n" - "scvtf v5.4s, v1.4s \n" - "scvtf v6.4s, v2.4s \n" - "scvtf v7.4s, v3.4s \n" - - "fmul v0.4s, v4.4s, %[scale].4s \n" - "fmul v1.4s, v5.4s, %[scale].4s \n" - "fmul v2.4s, v6.4s, %[scale].4s \n" - "fmul v3.4s, v7.4s, %[scale].4s \n" - - "fcvtas v4.4s, v0.4s \n" - "fcvtas v5.4s, v1.4s \n" - "fcvtas v6.4s, v2.4s \n" - "fcvtas v7.4s, v3.4s \n" - - "sqxtn v0.4h, v4.4s \n" - "sqxtn2 v0.8h, v5.4s \n" - "sqxtn v1.4h, v6.4s \n" - "sqxtn2 v1.8h, v7.4s \n" - - "sqxtn v2.8b, v0.8h \n" - "sqxtn2 v2.16b, v1.8h \n" - - "st1 {v2.16b}, [%[out]], #16 \n" - "subs %[loop], %[loop], #1 \n" - "bne 0b \n" - : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr) - : [scale] "w"(vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); -#else - asm volatile( - 
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vcvt.f32.s32 q4, q0 @ cvt to float\n" - "vcvt.f32.s32 q5, q1 @ cvt to float\n" - "vcvt.f32.s32 q6, q2 @ cvt to float\n" - "vcvt.f32.s32 q7, q3 @ cvt to float\n" - "vand.i32 q0, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q1, q0, q0 @ set offset, 0.5\n" - "vand.i32 q2, q0, q0 @ set offset, 0.5\n" - "vand.i32 q3, q0, q0 @ set offset, 0.5\n" - "vcgt.f32 q8, q4, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q5, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q0, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q1, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q2, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q3, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q0, q4, %q[vscale] @ mul scale\n" - "vmla.f32 q1, q5, %q[vscale] @ mul scale\n" - "vmla.f32 q2, q6, %q[vscale] @ mul scale\n" - "vmla.f32 q3, q7, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q4, q0 @ cvt to int32\n" - "vcvt.s32.f32 q5, q1 @ cvt to int32\n" - "vcvt.s32.f32 q6, q2 @ cvt to int32\n" - "vcvt.s32.f32 q7, q3 @ cvt to int32\n" - "vqmovn.s32 d16, q4 @ cnt to int16\n" - "vqmovn.s32 d17, q5 @ cnt to int16\n" - "vqmovn.s32 d18, q6 @ cnt to int16\n" - "vqmovn.s32 d19, q7 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vqmovn.s16 d8, q8 @ cnt to int8\n" - "vqmovn.s16 d9, q9 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d8-d9}, [%[dout]]! 
@ write to output\n" - "subs %[loop], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr) - : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff), - [vpoff] "w"(vpoff) - : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", - "q11"); -#endif // __aarch64__ - } - const int* din_r = din_c + 16 * cnt; - int8_t* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(in_scale * din_r[i])); - } - } -} - -void int32_to_int32(const int* din, int* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - int size_all = outer_size * axis_size * inner_size; - memmove(dout, din, size_all * sizeof(int)); -} - -template <> -void int32_to_dtype(const int* din, float* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size); -} - -template <> -void int32_to_dtype(const int* din, signed char* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size); -} - -template <> -void int32_to_dtype(const int* din, int* dout, const float* scale, - int axis_size, int64_t outer_size, int64_t inner_size) { - return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size); -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index 95c8b95ec16..337fd846cbd 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_a cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(concat_compute_arm SRCS 
concat_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(calib_compute_arm SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm) @@ -30,6 +31,7 @@ lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) +lite_cc_test(test_calib_compute_arm SRCS calib_compute_test.cc DEPS calib_compute_arm) lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm) set(arm_kernels diff --git a/paddle/fluid/lite/kernels/arm/calib_compute.cc b/paddle/fluid/lite/kernels/arm/calib_compute.cc new file mode 100644 index 00000000000..c64ebdc13db --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/calib_compute.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/calib_compute.h" +#include +#include "paddle/fluid/lite/arm/math/type_trans.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void CalibCompute::Run() { + auto& param = this->Param(); + std::vector scale = {param.in_scale}; + if (param.in_dtype == PRECISION(kFloat) && + param.out_dtype == PRECISION(kInt8)) { + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + lite::arm::math::fp32_to_int8(din, dout, scale.data(), 1, 1, + param.input->numel()); + return; + } + if (param.in_dtype == PRECISION(kInt8) && + param.out_dtype == PRECISION(kFloat)) { + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + lite::arm::math::int8_to_fp32(din, dout, scale.data(), 1, 1, + param.input->numel()); + return; + } + LOG(FATAL) << "Unsupport Dtype."; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(calib, kARM, kAny, kAny, + paddle::lite::kernels::arm::CalibCompute, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/calib_compute.h b/paddle/fluid/lite/kernels/arm/calib_compute.h new file mode 100644 index 00000000000..898e1ce6383 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/calib_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/operators/calib_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class CalibCompute : public KernelLite { + public: + using param_t = operators::CalibParam; + + // void PrepareForRun() override; + + void Run() override; + + ~CalibCompute() override{}; + + private: +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/calib_compute_test.cc b/paddle/fluid/lite/kernels/arm/calib_compute_test.cc new file mode 100644 index 00000000000..68bbd94b9b3 --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/calib_compute_test.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/calib_compute.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +static int get_rand(int start, int end) { + int i = rand(); // NOLINT + i = (i % (end - start)) + start; + return i; +} + +static void int8_to_fp32_basic(const int8_t* din, float* dout, + const float* scale, int axis_size, + int64_t outer_size, int64_t inner_size) { + int loop_size = axis_size * outer_size; + for (int i = 0; i < loop_size; ++i) { + float scale_in = scale[i % axis_size]; + for (int j = 0; j < inner_size; ++j) { + dout[j] = din[j] * scale_in; + } + dout += inner_size; + din += inner_size; + } +} + +static void fp32_to_int8_basic(const float* din, int8_t* dout, + const float* scale, int axis_size, + int64_t outer_size, int64_t inner_size) { + int loop_size = axis_size * outer_size; + for (int i = 0; i < loop_size; ++i) { + float inv_scale = 1.f / scale[i % axis_size]; + for (int j = 0; j < inner_size; ++j) { + dout[j] = static_cast(roundf(din[j] * inv_scale)); + } + dout += inner_size; + din += inner_size; + } +} + +void calib_ref(const operators::CalibParam& param) { + std::vector scale = {param.in_scale}; + if (param.in_dtype == PRECISION(kFloat) && + param.out_dtype == PRECISION(kInt8)) { + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + fp32_to_int8_basic(din, dout, scale.data(), 1, 1, param.input->numel()); + return; + } + if (param.in_dtype == PRECISION(kInt8) && + param.out_dtype == PRECISION(kFloat)) { + const auto* din = param.input->data(); + auto* dout = param.output->mutable_data(); + int8_to_fp32_basic(din, dout, scale.data(), 1, 1, param.input->numel()); + return; + } + LOG(FATAL) << "Unsupport Dtype."; +} + +TEST(calib_arm, retrive_op) { + auto calib = + KernelRegistry::Global() + .Create("calib"); 
+ ASSERT_FALSE(calib.empty()); + ASSERT_TRUE(calib.front()); +} + +TEST(calib_arm, init) { + CalibCompute calib; + ASSERT_EQ(calib.precision(), PRECISION(kAny)); + ASSERT_EQ(calib.target(), TARGET(kARM)); +} + +TEST(calib_arm, int8_to_fp32) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto c : {6, 32 /*, 128*/}) { + for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { + Tensor x; + Tensor output; + Tensor output_ref; + // set the dims of input, output, ref output tensors + x.Resize({n, c, h, w}); + output.Resize({n, c, h, w}); + output_ref.Resize({n, c, h, w}); + // initialize the data of input tensors + auto* x_data = x.mutable_data(); + auto* output_data = output.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + float sign = i % 3 == 0 ? -1.0f : 1.0f; + x_data[i] = sign * static_cast(i % 128) * 0.013f; + } + // prepare kernel params and run + CalibCompute calib; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + calib.SetContext(std::move(ctx)); + operators::CalibParam param; + param.in_scale = get_rand(0, 100) * 0.1f; + param.in_dtype = PRECISION(kInt8); + param.out_dtype = PRECISION(kFloat); + param.input = &x; + param.output = &output; + calib.SetParam(param); + calib.Launch(); + // invoking ref implementation and compare results + param.output = &output_ref; + calib_ref(param); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(calib, kARM, kAny, kAny, def); diff --git a/paddle/fluid/lite/operators/CMakeLists.txt b/paddle/fluid/lite/operators/CMakeLists.txt index 004e86175ad..900fbc96f06 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -21,6 +21,7 @@ 
cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS}) cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite) cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS}) cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS}) +cc_library(calib_op_lite SRCS calib_op.cc DEPS ${op_DEPS}) cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS}) cc_library(transpose_op_lite SRCS transpose_op.cc DEPS ${op_DEPS}) @@ -44,6 +45,7 @@ set(ops_lite activation_ops_lite dropout_op_lite concat_op_lite + calib_op_lite split_op_lite transpose_op_lite PARENT_SCOPE) @@ -60,6 +62,7 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) +lite_cc_test(test_calib_op_lite SRCS calib_op_test.cc DEPS calib_op_lite memory_lite ARM_DEPS calib_compute_arm) lite_cc_test(test_fusion_elementwise_activation_ops_lite SRCS fusion_elementwise_activation_ops_test.cc DEPS fusion_elementwise_activation_ops_lite memory_lite) diff --git a/paddle/fluid/lite/operators/calib_op.cc b/paddle/fluid/lite/operators/calib_op.cc new file mode 100644 index 00000000000..e9d188e4aeb --- /dev/null +++ b/paddle/fluid/lite/operators/calib_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/calib_op.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool CalibOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  return true;
+}
+bool CalibOpLite::InferShape() const {
+  param_.output->Resize(param_.input->dims());
+  return true;
+}
+
+bool CalibOpLite::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  auto x_var = scope->FindVar(opdesc.Input("Input").front());
+  auto output_var = scope->FindVar(opdesc.Output("Out").front());
+  CHECK(x_var);
+  CHECK(output_var);
+  param_.input = const_cast<lite::Tensor *>(&(x_var->Get<lite::Tensor>()));
+  param_.output = output_var->GetMutable<lite::Tensor>();
+  std::vector<std::string> input_arg_names = opdesc.InputArgumentNames();
+  param_.in_dtype =
+      static_cast<lite::PrecisionType>(opdesc.GetAttr<int>("in_dtype"));
+  param_.out_dtype =
+      static_cast<lite::PrecisionType>(opdesc.GetAttr<int>("out_dtype"));
+  if (opdesc.HasAttr("in_scale")) {
+    param_.in_scale = opdesc.GetAttr<float>("in_scale");
+  }
+  CHECK(param_.input) << "Input(X) of CalibOp should not be null.";
+  CHECK(param_.output) << "Output(Out) of CalibOp should not be null.";
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(calib, paddle::lite::operators::CalibOpLite);
diff --git a/paddle/fluid/lite/operators/calib_op.h b/paddle/fluid/lite/operators/calib_op.h
new file mode 100644
index 00000000000..ef6e94c4c12
--- /dev/null
+++ b/paddle/fluid/lite/operators/calib_op.h
@@ -0,0 +1,60 @@
+// Copyright (c) 2019 PaddlePaddle Authors.
All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/compatible_tensor.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/scope.h"
+#include "paddle/fluid/lite/operators/op_params.h"
+#include "paddle/fluid/lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class CalibOpLite : public OpLite {
+ public:
+  CalibOpLite() {}
+
+  explicit CalibOpLite(const std::string &type) : OpLite(type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  /*
+  bool Run() override {
+    CHECK(kernel_);
+    kernel_->Run();
+    return true;
+  }
+  */
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope);
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+
+  std::string DebugString() const override { return "calib"; }
+
+ private:
+  mutable CalibParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/operators/calib_op_test.cc b/paddle/fluid/lite/operators/calib_op_test.cc
new file mode 100644
index 00000000000..2b54352059a
--- /dev/null
+++ b/paddle/fluid/lite/operators/calib_op_test.cc
@@ -0,0 +1,64 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/calib_op.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+#ifdef LITE_WITH_ARM
+TEST(calib_op_lite, TestARM) {
+  // prepare variables
+  Scope scope;
+  auto* x = scope.Var("Input")->GetMutable<lite::Tensor>();
+  auto* output = scope.Var("output")->GetMutable<lite::Tensor>();
+  x->Resize(DDim(std::vector<int64_t>({1, 10, 20})));
+  output->Resize(DDim(std::vector<int64_t>{1, 10, 20}));
+
+  // set data
+  for (int i = 0; i < 10 * 20; i++) {
+    x->mutable_data<float>()[i] = i;
+  }
+  for (int i = 0; i < 10 * 20; i++) {
+    output->mutable_data<float>()[i] = 0.;
+  }
+
+  // prepare op desc
+  cpp::OpDesc desc;
+  desc.SetType("calib");
+  desc.SetInput("Input", {"Input"});
+  desc.SetOutput("Out", {"output"});
+  desc.SetAttr("in_dtype", static_cast<int>(PRECISION(kInt8)));
+  desc.SetAttr("out_dtype", static_cast<int>(PRECISION(kFloat)));
+  desc.SetAttr("in_scale", 10.0f);
+
+  CalibOpLite calib("calib");
+
+  calib.SetValidPlaces({Place{TARGET(kARM), PRECISION(kAny)}});
+  calib.Attach(desc, &scope);
+  auto kernels = calib.CreateKernels({Place{TARGET(kARM), PRECISION(kAny)}});
+  ASSERT_FALSE(kernels.empty());
+}
+#endif
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+#ifdef LITE_WITH_ARM
+USE_LITE_KERNEL(calib, kARM, kAny, kAny, def);
+#endif
diff --git a/paddle/fluid/lite/operators/op_params.h b/paddle/fluid/lite/operators/op_params.h
index
b50e14a4855..23a21c41a3c 100644 --- a/paddle/fluid/lite/operators/op_params.h +++ b/paddle/fluid/lite/operators/op_params.h @@ -48,6 +48,14 @@ struct IoCopyParam { lite::Tensor* y{}; }; +struct CalibParam { + const lite::Tensor* input{}; + lite::Tensor* output{}; + float in_scale; + PrecisionType in_dtype; + PrecisionType out_dtype; +}; + /// -------------------------- NN operators ------------------------------------ struct FcParam { -- GitLab