提交 9ae05da3 编写于 作者: T tensor-tang

fix format

上级 1b1f9b07
...@@ -172,3 +172,4 @@ add_subdirectory(model_parser) ...@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
add_subdirectory(utils) add_subdirectory(utils)
add_subdirectory(api) add_subdirectory(api)
add_subdirectory(gen_code) add_subdirectory(gen_code)
...@@ -54,3 +54,4 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc ...@@ -54,3 +54,4 @@ lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
mir_passes mir_passes
${ops_lite} ${host_kernels} ${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels}) ARM_DEPS ${arm_kernels})
add_subdirectory(math) add_subdirectory(math)
...@@ -34,3 +34,4 @@ cc_library(math_arm SRCS ...@@ -34,3 +34,4 @@ cc_library(math_arm SRCS
split.cc split.cc
DEPS ${lite_kernel_deps} eigen3) DEPS ${lite_kernel_deps} eigen3)
...@@ -12,9 +12,10 @@ ...@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/lite/arm/math/saturate.h" #include "paddle/fluid/lite/arm/math/type_trans.h"
#include <arm_neon.h> #include <arm_neon.h>
#include <string.h> #include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -23,14 +24,13 @@ namespace math { ...@@ -23,14 +24,13 @@ namespace math {
template <typename dtype> template <typename dtype>
void int32_to_dtype(const int* din, dtype* dout, const float* scale, void int32_to_dtype(const int* din, dtype* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size); int axis_size, int64_t outer_size, int64_t inner_size);
void fp32_to_int8(const float* din, signed char* dout, const float* scale, void fp32_to_int8(const float* din, signed char* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16; int cnt = inner_size / 16;
int remain = inner_size & 15; int remain = inner_size & 15;
long long loop_size = outer_size * axis_size; int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < loop_size; ++j) { for (int j = 0; j < loop_size; ++j) {
...@@ -69,10 +69,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale, ...@@ -69,10 +69,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
"sqxtn2 v8.16b, v5.8h \n" "sqxtn2 v8.16b, v5.8h \n"
"str q8, [%[out]], #16 \n" "str q8, [%[out]], #16 \n"
"bne 0b \n" "bne 0b \n"
: [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop) : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w" (vscale) : [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
); "v11");
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
...@@ -110,10 +110,11 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale, ...@@ -110,10 +110,11 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
"subs %[cnt], #1 @ loop count -1\n" "subs %[cnt], #1 @ loop count -1\n"
"bne 0b @ to main loop\n" "bne 0b @ to main loop\n"
:[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop) : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
:[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero) : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" [vzero] "w"(vzero)
); : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif #endif
} }
const float* din_r = din_c + 16 * cnt; const float* din_r = din_c + 16 * cnt;
...@@ -125,11 +126,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale, ...@@ -125,11 +126,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
} }
void fp32_to_int16(const float* din, int16_t* dout, const float* scale, void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 8; int cnt = inner_size / 8;
int remain = inner_size & 7; int remain = inner_size & 7;
long long loop_size = outer_size * axis_size; int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for #pragma omp parallel for
for (int j = 0; j < loop_size; ++j) { for (int j = 0; j < loop_size; ++j) {
...@@ -158,10 +158,9 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale, ...@@ -158,10 +158,9 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
"sqxtn2 v4.8h, v9.4s \n" "sqxtn2 v4.8h, v9.4s \n"
"str q4, [%[out]], #16 \n" "str q4, [%[out]], #16 \n"
"bne 0b \n" "bne 0b \n"
: [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop) : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w" (vscale) : [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v8", "v9" : "v0", "v1", "v4", "v5", "v8", "v9");
);
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
...@@ -185,10 +184,10 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale, ...@@ -185,10 +184,10 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
"subs %[cnt], #1 @ loop count -1\n" "subs %[cnt], #1 @ loop count -1\n"
"bne 0b @ to main loop\n" "bne 0b @ to main loop\n"
:[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop) : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
:[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero) : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
:"q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9" [vzero] "w"(vzero)
); : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif #endif
} }
const float* din_r = din_c + 8 * cnt; const float* din_r = din_c + 8 * cnt;
...@@ -200,13 +199,12 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale, ...@@ -200,13 +199,12 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
} }
void int8_to_fp32(const signed char* in, float* out, const float* scale, void int8_to_fp32(const signed char* in, float* out, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16; int cnt = inner_size / 16;
int remain = inner_size & 15; int remain = inner_size & 15;
long long loop_size = axis_size * outer_size; int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for #pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) { for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size]; float in_scale = scale[n % axis_size];
const signed char* din_c = in + n * inner_size; const signed char* din_c = in + n * inner_size;
float* dout_c = out + n * inner_size; float* dout_c = out + n * inner_size;
...@@ -245,10 +243,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale, ...@@ -245,10 +243,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
"stp q6, q7, [%[out]], #32 \n" /* write to memory*/ "stp q6, q7, [%[out]], #32 \n" /* write to memory*/
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
); "v11");
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n" "vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n"
...@@ -276,11 +274,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale, ...@@ -276,11 +274,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
); #endif // __aarch64__
#endif //__aarch64__
} }
const signed char* din_r = din_c + 16 * cnt; const signed char* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt; float* dout_r = dout_c + 16 * cnt;
...@@ -290,21 +287,20 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale, ...@@ -290,21 +287,20 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
} }
} }
void int16_to_fp32(const short* in, float* out, const float* scale, void int16_to_fp32(const int16_t* in, float* out, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16; int cnt = inner_size / 16;
int remain = inner_size & 15; int remain = inner_size & 15;
long long loop_size = axis_size * outer_size; int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for #pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) { for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size]; float in_scale = scale[n % axis_size];
const short* din_c = in + n * inner_size; const int16_t* din_c = in + n * inner_size;
float* dout_c = out + n * inner_size; float* dout_c = out + n * inner_size;
float32x4_t vscale = vdupq_n_f32(in_scale); float32x4_t vscale = vdupq_n_f32(in_scale);
if (cnt > 0) { if (cnt > 0) {
int loop = cnt; int loop = cnt;
const short* din_ptr = din_c; const int16_t* din_ptr = din_c;
float* dout_ptr = dout_c; float* dout_ptr = dout_c;
#ifdef __aarch64__ #ifdef __aarch64__
asm volatile( asm volatile(
...@@ -333,10 +329,9 @@ void int16_to_fp32(const short* in, float* out, const float* scale, ...@@ -333,10 +329,9 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
"stp q6, q7, [%[out]], #32 \n" /* write to memory*/ "stp q6, q7, [%[out]], #32 \n" /* write to memory*/
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
);
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n" "vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n"
...@@ -362,13 +357,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale, ...@@ -362,13 +357,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n" "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
); #endif // __aarch64__
#endif //__aarch64__
} }
const short* din_r = din_c + 16 * cnt; const int16_t* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt; float* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
dout_r[i] = in_scale * din_r[i]; dout_r[i] = in_scale * din_r[i];
...@@ -377,12 +371,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale, ...@@ -377,12 +371,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
} }
void int32_to_fp32(const int* din, float* dout, const float* scale, void int32_to_fp32(const int* din, float* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16; int cnt = inner_size / 16;
int remain = inner_size & 15; int remain = inner_size & 15;
long long loop_size = axis_size * outer_size; int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for #pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) { for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size]; float in_scale = scale[n % axis_size];
const int* din_c = din + n * inner_size; const int* din_c = din + n * inner_size;
float* dout_c = dout + n * inner_size; float* dout_c = dout + n * inner_size;
...@@ -410,10 +404,10 @@ void int32_to_fp32(const int* din, float* dout, const float* scale, ...@@ -410,10 +404,10 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
"stp q10, q11, [%[out]], #32 \n" "stp q10, q11, [%[out]], #32 \n"
"subs %[loop], %[loop], #1 \n" "subs %[loop], %[loop], #1 \n"
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
); "v11");
#else #else
asm volatile( asm volatile(
"vld1.s32 {d0-d3}, [%[in]]! \n" "vld1.s32 {d0-d3}, [%[in]]! \n"
...@@ -433,11 +427,11 @@ void int32_to_fp32(const int* din, float* dout, const float* scale, ...@@ -433,11 +427,11 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
"vst1.f32 {d16-d19}, [%[out]]! \n" "vst1.f32 {d16-d19}, [%[out]]! \n"
"vst1.f32 {d20-d23}, [%[out]]! \n" "vst1.f32 {d20-d23}, [%[out]]! \n"
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
); "q11");
#endif //__aarch64__ #endif // __aarch64__
} }
const int* din_r = din_c + 16 * cnt; const int* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt; float* dout_r = dout_c + 16 * cnt;
...@@ -447,13 +441,13 @@ void int32_to_fp32(const int* din, float* dout, const float* scale, ...@@ -447,13 +441,13 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
} }
} }
void int32_to_int8(const int* din, signed char* dout, const float* scale, \ void int32_to_int8(const int* din, signed char* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16; int cnt = inner_size / 16;
int remain = inner_size & 15; int remain = inner_size & 15;
long long loop_size = outer_size * axis_size; int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for #pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) { for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size]; float in_scale = scale[n % axis_size];
const int* din_c = din + n * inner_size; const int* din_c = din + n * inner_size;
signed char* dout_c = dout + n * inner_size; signed char* dout_c = dout + n * inner_size;
...@@ -497,10 +491,9 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \ ...@@ -497,10 +491,9 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
"st1 {v2.16b}, [%[out]], #16 \n" "st1 {v2.16b}, [%[out]], #16 \n"
"subs %[loop], %[loop], #1 \n" "subs %[loop], %[loop], #1 \n"
"bne 0b \n" "bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
:[scale] "w" (vscale) : [scale] "w"(vscale)
:"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
);
#else #else
asm volatile( asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
...@@ -541,11 +534,12 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \ ...@@ -541,11 +534,12 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
"vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n"
"subs %[loop], #1 @ loop count -1\n" "subs %[loop], #1 @ loop count -1\n"
"bne 0b @ to main loop\n" "bne 0b @ to main loop\n"
:[loop] "+r" (loop), [din] "+r" (din_ptr), [dout] "+r" (dout_ptr) : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
:[vscale] "w" (vscale), [vzero] "w"(vzero), [vnoff] "w" (vnoff), [vpoff] "w" (vpoff) : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" [vpoff] "w"(vpoff)
); : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
#endif //__aarch64__ "q11");
#endif // __aarch64__
} }
const int* din_r = din_c + 16 * cnt; const int* din_r = din_c + 16 * cnt;
int8_t* dout_r = dout_c + 16 * cnt; int8_t* dout_r = dout_c + 16 * cnt;
...@@ -555,30 +549,27 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \ ...@@ -555,30 +549,27 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
} }
} }
void int32_to_int32(const int* din, int* dout, const float* scale, \ void int32_to_int32(const int* din, int* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
int size_all = outer_size * axis_size * inner_size; int size_all = outer_size * axis_size * inner_size;
memmove(dout, din, size_all*sizeof(int)); memmove(dout, din, size_all * sizeof(int));
} }
template <> template <>
void int32_to_dtype(const int* din, float* dout, const float* scale, void int32_to_dtype(const int* din, float* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size); return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
} }
template <> template <>
void int32_to_dtype(const int* din, signed char* dout, const float* scale, void int32_to_dtype(const int* din, signed char* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size); return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
} }
template <> template <>
void int32_to_dtype(const int* din, int* dout, const float* scale, void int32_to_dtype(const int* din, int* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) { int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size); return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
} }
......
...@@ -57,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li ...@@ -57,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite) lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite) lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator) lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
...@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) ...@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
mir_passes compatible_pb_lite program_lite ${ops_lite}) mir_passes compatible_pb_lite program_lite ${ops_lite})
endif() endif()
...@@ -4,3 +4,4 @@ endif() ...@@ -4,3 +4,4 @@ endif()
lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc) lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc)
lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite)
...@@ -4,3 +4,4 @@ endif() ...@@ -4,3 +4,4 @@ endif()
nv_library(target_wrapper_cuda SRCS target_wrapper.cc) nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas_lite SRCS blas.cc) nv_library(cuda_blas_lite SRCS blas.cc)
...@@ -25,3 +25,4 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) ...@@ -25,3 +25,4 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_dependencies(__generated_code__ test_gen_code_lite) add_dependencies(__generated_code__ test_gen_code_lite)
endif() endif()
cc_library(target_wrapper_host SRCS target_wrapper.cc) cc_library(target_wrapper_host SRCS target_wrapper.cc)
...@@ -5,3 +5,4 @@ add_subdirectory(arm) ...@@ -5,3 +5,4 @@ add_subdirectory(arm)
add_subdirectory(cuda) add_subdirectory(cuda)
add_subdirectory(x86) add_subdirectory(x86)
...@@ -40,3 +40,4 @@ set(arm_kernels ...@@ -40,3 +40,4 @@ set(arm_kernels
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels") set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
...@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) ...@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite})
nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
...@@ -13,3 +13,4 @@ set(host_kernels ...@@ -13,3 +13,4 @@ set(host_kernels
) )
set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels") set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels")
...@@ -35,3 +35,4 @@ set(x86_kernels ...@@ -35,3 +35,4 @@ set(x86_kernels
) )
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
...@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des ...@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des
add_subdirectory(pb) add_subdirectory(pb)
add_subdirectory(cpp) add_subdirectory(cpp)
cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite) cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite) cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite)
cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite) cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite)
...@@ -56,3 +56,4 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m ...@@ -56,3 +56,4 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m
lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite) lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite)
lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
...@@ -88,3 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel ...@@ -88,3 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
RUN apt-get autoremove -y && apt-get clean RUN apt-get autoremove -y && apt-get clean
RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz
\ No newline at end of file
...@@ -283,3 +283,4 @@ function main { ...@@ -283,3 +283,4 @@ function main {
} }
main $@ main $@
...@@ -124,3 +124,4 @@ $ adb devices ...@@ -124,3 +124,4 @@ $ adb devices
List of devices attached List of devices attached
5cb00b6 device 5cb00b6 device
``` ```
...@@ -9,3 +9,4 @@ set(utils_DEPS glog) ...@@ -9,3 +9,4 @@ set(utils_DEPS glog)
lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite) lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite)
cc_library(any_lite SRCS any.cc) cc_library(any_lite SRCS any.cc)
cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite)
...@@ -4,3 +4,4 @@ endif() ...@@ -4,3 +4,4 @@ endif()
cc_library(target_wrapper_x86 SRCS target_wrapper.cc) cc_library(target_wrapper_x86 SRCS target_wrapper.cc)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册