diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
new file mode 100755
index 0000000000000000000000000000000000000000..a6f900a4d144fa06dbbe8f4f8312bed5923b8b33
--- /dev/null
+++ b/.gitlab-ci.yml
@@ -0,0 +1,74 @@
+before_script:
+  - env
+
+image: $SERVER_LITE_DOCKER_IMAGE
+
+stages:
+  - ci
+  - build_server
+  - build_mobile
+
+check:prebuilt:
+  tags:
+    - lite
+  stage: ci
+  script:
+    #- pip3 install pre-commit
+    #- alias python=python3
+    - rm -rf ~/.pip
+    - pip install pre-commit
+    - pre-commit install
+    - ./paddle/fluid/lite/tools/build.sh check_style
+    #- ./paddle/fluid/lite/tools/build.sh check_need_ci
+  cache:
+    key: check_style
+    paths:
+      - /root/.cache
+
+build:server:
+  tags:
+    - lite
+  image: $SERVER_LITE_DOCKER_IMAGE
+  stage: build_server
+  cache:
+    key: server_thirdparty
+    paths:
+      - build/third_party
+      - /root/.ccache
+  script:
+    - apt install ccache
+    - export http_proxy=http://172.19.57.45:3128
+    - export https_proxy=http://172.19.57.45:3128
+    #- export http_proxy=http://agent.baidu.com:8118
+    #- export https_proxy=http://agent.baidu.com:8118
+    - mkdir -p build
+    - cd build
+    - ../paddle/fluid/lite/tools/build.sh cmake_x86
+    - make extern_eigen3
+    - make extern_boost
+    - make framework_proto
+    - make extern_warpctc
+    - cd ..
+    - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/build/third_party/install/mklml/lib
+    - ./paddle/fluid/lite/tools/build.sh build_test_server
+  dependencies:
+    - check:prebuilt
+
+build:mobile:
+  tags:
+    - lite
+  stage: build_mobile
+  image: $MOBILE_LITE_DOCKER_IMAGE
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - /root/.ccache
+  script:
+    - apt install ccache
+    - export http_proxy=http://172.19.57.45:3128
+    - export https_proxy=http://172.19.57.45:3128
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm
+  dependencies:
+    - build:server
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4ef4a4c351e4b701f481b5b23076ea3535fa7231..312bdb7f1ae11576abf6f5ec222bae72bcd67bb5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -166,6 +166,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
   #include(external/zlib)     # download, build, install gtest
   include(external/protobuf)  # download, build, install protobuf
   include(external/eigen)     # download eigen3
+  include(ccache)             # set ccache for compilation
 
 include(generic)              # simplify cmake module
 include(configure)            # add paddle env configuration
diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt
index ac9ff84da449c6babdb380b54e90920d8cb6e70f..301dbea2b7601d43b20095685d82a11ae5dcc2f6 100644
--- a/paddle/fluid/lite/CMakeLists.txt
+++ b/paddle/fluid/lite/CMakeLists.txt
@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
 add_subdirectory(utils)
 add_subdirectory(api)
 add_subdirectory(gen_code)
+
diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt
index cc4d4b440b681b4cb6974e6a0ef6800b70ad0c4e..46f38534c74d7269a440670331f90c33179dffb2 100644
--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
@@ -14,7 +14,7 @@ if(LITE_WITH_CUDA)
     set(light_api_deps ${light_api_deps} target_wrapper_cuda)
 endif()
 
-cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})
+#cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})
 
 message(STATUS "get ops ${ops_lite}")
 message(STATUS "get Host kernels ${host_kernels}")
diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc
index fb3cead2b16104a9da9cf25e53b0b75e8913388a..0cc786c024f6d7447ec57bb4a539ddf8bcdb1c25 100644
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -66,7 +66,7 @@ USE_LITE_OP(fetch);
 USE_LITE_OP(io_copy);
 USE_LITE_OP(con2d);
-USE_LITE_OP(batch_norm);
+// USE_LITE_OP(batch_norm);
 USE_LITE_OP(relu);
 USE_LITE_OP(depthwise_conv2d);
 USE_LITE_OP(pool2d);
diff --git a/paddle/fluid/lite/arm/CMakeLists.txt b/paddle/fluid/lite/arm/CMakeLists.txt
index 8abd04b52338299f75399903aa68fe834ce81d04..1980267380d4ed32f7530ef62861119c9094f015 100644
--- a/paddle/fluid/lite/arm/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/CMakeLists.txt
@@ -1,2 +1,3 @@
 add_subdirectory(math)
+
diff --git a/paddle/fluid/lite/arm/math/CMakeLists.txt b/paddle/fluid/lite/arm/math/CMakeLists.txt
index 2a912e434ae60ab8be587d044541c4b8b464a435..17d1b7d9b2adc4f048b0e4056d435365f9410b53 100644
--- a/paddle/fluid/lite/arm/math/CMakeLists.txt
+++ b/paddle/fluid/lite/arm/math/CMakeLists.txt
@@ -32,5 +32,7 @@ cc_library(math_arm SRCS
             conv_winograd_3x3.cc
             conv_winograd.cc
             split.cc
-            DEPS ${lite_kernel_deps} eigen3)
+            DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
+            # TODO(TJ): fix me, do not depend on proto
+
diff --git a/paddle/fluid/lite/arm/math/scale.cc b/paddle/fluid/lite/arm/math/scale.cc
index 40b91e6979f6f330f96f4c086fe1856707d9b189..ce969358f689ef7713efb435ce58ba72471d282b 100644
--- a/paddle/fluid/lite/arm/math/scale.cc
+++ b/paddle/fluid/lite/arm/math/scale.cc
@@ -58,6 +58,111 @@ void scale(const float* din, float* dout, int num, float scale,
   }
 }
 
+template <>
+void scale(const float* din, float* dout, int outer_dim, int scale_dim,
+           int inner_dim, const float* scale_data,
+           const float* bias_data) {
+  int cnt = inner_dim >> 4;
+  int remain = inner_dim % 16;
+  int size = inner_dim * scale_dim;
+  for (int n = 0; n < outer_dim; n++) {
+    const float* din_ptr_n = din + n * size;
+    float* dout_ptr_n = dout + n * size;
+#pragma omp parallel for
+    for (int i = 0; i < scale_dim; i++) {
+      const float* din_ptr = din_ptr_n + i * inner_dim;
+      float* dout_ptr = dout_ptr_n + i * inner_dim;
+      float scale = scale_data[i];
+      float32x4_t vscale = vdupq_n_f32(scale);
+      float bias = bias_data[i];
+      float32x4_t vbias = vdupq_n_f32(bias);
+      for (int j = 0; j < cnt; j++) {
+        float32x4_t din0 = vld1q_f32(din_ptr);
+        float32x4_t din1 = vld1q_f32(din_ptr + 4);
+        float32x4_t din2 = vld1q_f32(din_ptr + 8);
+        float32x4_t din3 = vld1q_f32(din_ptr + 12);
+
+        float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
+        float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
+        float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
+        float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
+
+        din_ptr += 16;
+        vst1q_f32(dout_ptr, vsum1);
+        vst1q_f32(dout_ptr + 4, vsum2);
+        vst1q_f32(dout_ptr + 8, vsum3);
+        vst1q_f32(dout_ptr + 12, vsum4);
+
+        dout_ptr += 16;
+      }
+      for (int j = 0; j < remain; j++) {
+        *dout_ptr = *din_ptr * scale + bias;
+        dout_ptr++;
+        din_ptr++;
+      }
+    }
+  }
+}
+
+template <>
+void scale(const float* din, float* dout, int outer_dim, int scale_dim,
+           const float* scale_data, const float* bias_data) {
+  int cnt = scale_dim >> 4;
+  int remain = scale_dim % 16;
+  for (int n = 0; n < outer_dim; n++) {
+    const float* din_ptr_n = din + n * scale_dim;
+    float* dout_ptr_n = dout + n * scale_dim;
+#pragma omp parallel for
+    for (int i = 0; i < cnt; i++) {
+      int idx = i << 4;
+      const float* din_ptr = din_ptr_n + idx;
+      const float* scale_ptr = scale_data + idx;
+      const float* bias_ptr = bias_data + idx;
+      float* dout_ptr = dout_ptr_n + idx;
+
+      float32x4_t din0 = vld1q_f32(din_ptr);
+      float32x4_t vscale0 = vld1q_f32(scale_ptr);
+      float32x4_t vbias0 = vld1q_f32(bias_ptr);
+
+      float32x4_t din1 = vld1q_f32(din_ptr + 4);
+      float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
+      float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
+
+      float32x4_t din2 = vld1q_f32(din_ptr + 8);
+      float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
+      float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
+
+      float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
+      float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
+
+      float32x4_t din3 = vld1q_f32(din_ptr + 12);
+      float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
+      float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
+
+      vst1q_f32(dout_ptr, vsum1);
+      vst1q_f32(dout_ptr + 4, vsum2);
+
+      float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
+      float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
+
+      vst1q_f32(dout_ptr + 8, vsum3);
+      vst1q_f32(dout_ptr + 12, vsum4);
+    }
+    int idx = cnt << 4;
+    const float* din_ptr = din_ptr_n + idx;
+    float* dout_ptr = dout_ptr_n + idx;
+    const float* scale_ptr = scale_data + idx;
+    const float* bias_ptr = bias_data + idx;
+    for (int j = 0; j < remain; j++) {
+      *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
+      dout_ptr++;
+      din_ptr++;
+      scale_ptr++;
+      bias_ptr++;
+    }
+  }
+}
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/paddle/fluid/lite/arm/math/scale.h b/paddle/fluid/lite/arm/math/scale.h
index 97a5f79fc6bfabee5e38854e2ba89ce388648aac..2274dd23d2f4f486e39b97ad5040bde47af8a042 100644
--- a/paddle/fluid/lite/arm/math/scale.h
+++ b/paddle/fluid/lite/arm/math/scale.h
@@ -22,6 +22,14 @@ namespace math {
 template <typename T>
 void scale(const T* din, T* dout, int num, float scale, float bias);
 
+template <typename T>
+void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim,
+           const float* scale_data, const float* bias_data);
+
+template <typename T>
+void scale(const T* din, T* dout, int outer_dim, int scale_dim,
+           const float* scale_data, const float* bias_data);
+
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
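[Note] The two NEON `scale` overloads added above compute a per-channel affine map. A scalar reference showing the intended semantics (illustrative only, not part of this patch; `scale_ref` is a hypothetical name):

// dout[n][c][i] = din[n][c][i] * scale_data[c] + bias_data[c];
// the second overload is this with inner_dim == 1 (one scale/bias pair
// per element along the last axis).
void scale_ref(const float* din, float* dout, int outer_dim, int scale_dim,
               int inner_dim, const float* scale_data,
               const float* bias_data) {
  for (int n = 0; n < outer_dim; ++n) {
    for (int c = 0; c < scale_dim; ++c) {
      const float s = scale_data[c];
      const float b = bias_data[c];
      const float* in = din + (n * scale_dim + c) * inner_dim;
      float* out = dout + (n * scale_dim + c) * inner_dim;
      for (int i = 0; i < inner_dim; ++i) out[i] = in[i] * s + b;
    }
  }
}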
-#include "paddle/fluid/lite/arm/math/saturate.h" +#include "paddle/fluid/lite/arm/math/type_trans.h" #include #include +#include "paddle/fluid/lite/arm/math/saturate.h" namespace paddle { namespace lite { @@ -23,563 +24,553 @@ namespace math { template void int32_to_dtype(const int* din, dtype* dout, const float* scale, - int axis_size, long long outer_size, long long inner_size); + int axis_size, int64_t outer_size, int64_t inner_size); void fp32_to_int8(const float* din, signed char* dout, const float* scale, - int axis_size, long long outer_size, long long inner_size) { - - int cnt = inner_size / 16; - int remain = inner_size & 15; - long long loop_size = outer_size * axis_size; + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = outer_size * axis_size; #pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - signed char* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - signed char* dout_ptr = dout_c; + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + const float* din_c = din + j * inner_size; + signed char* dout_c = dout + j * inner_size; + if (cnt > 0) { + int cnt_loop = cnt; + const float* din_ptr = din_c; + signed char* dout_ptr = dout_c; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "ldp q2, q3, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "fmul v6.4s, v2.4s, %[scale].4s \n" - "fmul v7.4s, v3.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "FCVTAS v10.4s, v6.4s \n" - "FCVTAS v11.4s, v7.4s \n" - "ldp q2, q3, [%[in]], #32 \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "sqxtn v5.4h, v10.4s \n" - "sqxtn2 v5.8h, v11.4s \n" - "sqxtn v8.8b, v4.8h \n" - "sqxtn2 v8.16b, v5.8h \n" - "str q8, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop) - : [scale] "w" (vscale) - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" - ); + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "ldp q2, q3, [%[in]], #32 \n" + "0: \n" /* main loop */ + "fmul v4.4s, v0.4s, %[scale].4s \n" + "fmul v5.4s, v1.4s, %[scale].4s \n" + "fmul v6.4s, v2.4s, %[scale].4s \n" + "fmul v7.4s, v3.4s, %[scale].4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "subs %[cnt], %[cnt], #1 \n" + "FCVTAS v8.4s, v4.4s \n" + "FCVTAS v9.4s, v5.4s \n" + "FCVTAS v10.4s, v6.4s \n" + "FCVTAS v11.4s, v7.4s \n" + "ldp q2, q3, [%[in]], #32 \n" + "sqxtn v4.4h, v8.4s \n" + "sqxtn2 v4.8h, v9.4s \n" + "sqxtn v5.4h, v10.4s \n" + "sqxtn2 v5.8h, v11.4s \n" + "sqxtn v8.8b, v4.8h \n" + "sqxtn2 v8.16b, v5.8h \n" + "str q8, [%[out]], #16 \n" + "bne 0b \n" + : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) + : [scale] "w"(vscale) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", + "v11"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" - "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" - "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" - "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vcvt.s32.f32 q2, q6 @ cvt to int32\n" - "vcvt.s32.f32 q3, q7 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vqmovn.s32 d10, q2 @ cnt to int16\n" - "vqmovn.s32 d11, q3 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vqmovn.s16 d12, q4 @ cnt to int8\n" - "vqmovn.s16 d13, q5 @ cnt to int8\n" - "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" - "vst1.32 {d12-d13}, [%[dout]]! @ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - :[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop) - :[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero) - :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11" - ); + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "0: @ main loop\n" + "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q5, q4, q4 @ set offset, 0.5\n" + "vand.i32 q6, q4, q4 @ set offset, 0.5\n" + "vand.i32 q7, q4, q4 @ set offset, 0.5\n" + "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" + "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2\n" + "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3\n" + "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" + "vbif.f32 q6, %q[vnoff], q10 @ get right offset\n" + "vbif.f32 q7, %q[vnoff], q11 @ get right offset\n" + "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" + "vmla.f32 q6, q2, %q[vscale] @ mul scale\n" + "vmla.f32 q7, q3, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q0, q4 @ cvt to int32\n" + "vcvt.s32.f32 q1, q5 @ cvt to int32\n" + "vcvt.s32.f32 q2, q6 @ cvt to int32\n" + "vcvt.s32.f32 q3, q7 @ cvt to int32\n" + "vqmovn.s32 d8, q0 @ cnt to int16\n" + "vqmovn.s32 d9, q1 @ cnt to int16\n" + "vqmovn.s32 d10, q2 @ cnt to int16\n" + "vqmovn.s32 d11, q3 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vqmovn.s16 d12, q4 @ cnt to int8\n" + "vqmovn.s16 d13, q5 @ cnt to int8\n" + "vld1.32 {d4-d7}, [%[din]]! @ load in8~in16\n" + "vst1.32 {d12-d13}, [%[dout]]! 
@ write to output\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + + : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) + : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), + [vzero] "w"(vzero) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", + "q11"); #endif - } - const float* din_r = din_c + 16 * cnt; - signed char* dout_r = dout_c + 16 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } } + const float* din_r = din_c + 16 * cnt; + signed char* dout_r = dout_c + 16 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); + } + } } void fp32_to_int16(const float* din, int16_t* dout, const float* scale, - int axis_size, long long outer_size, long long inner_size) { - - int cnt = inner_size / 8; - int remain = inner_size & 7; - long long loop_size = outer_size * axis_size; + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 8; + int remain = inner_size & 7; + int64_t loop_size = outer_size * axis_size; #pragma omp parallel for - for (int j = 0; j < loop_size; ++j) { - float inv_scale = 1.f / scale[j % axis_size]; - float32x4_t vzero = vdupq_n_f32(0.f); - float32x4_t vscale = vdupq_n_f32(inv_scale); - float32x4_t vpoff = vdupq_n_f32(0.5f); - float32x4_t vnoff = vdupq_n_f32(-0.5f); - const float* din_c = din + j * inner_size; - int16_t* dout_c = dout + j * inner_size; - if (cnt > 0) { - int cnt_loop = cnt; - const float* din_ptr = din_c; - int16_t* dout_ptr = dout_c; + for (int j = 0; j < loop_size; ++j) { + float inv_scale = 1.f / scale[j % axis_size]; + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vscale = vdupq_n_f32(inv_scale); + float32x4_t vpoff = vdupq_n_f32(0.5f); + float32x4_t vnoff = vdupq_n_f32(-0.5f); + const float* din_c = din + j * inner_size; + int16_t* dout_c = dout + j * inner_size; + if (cnt > 0) { + int cnt_loop = cnt; + const float* din_ptr = din_c; + int16_t* dout_ptr = dout_c; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[in]], #32 \n" - "0: \n" /* main loop */ - "fmul v4.4s, v0.4s, %[scale].4s \n" - "fmul v5.4s, v1.4s, %[scale].4s \n" - "ldp q0, q1, [%[in]], #32 \n" - "subs %[cnt], %[cnt], #1 \n" - "FCVTAS v8.4s, v4.4s \n" - "FCVTAS v9.4s, v5.4s \n" - "sqxtn v4.4h, v8.4s \n" - "sqxtn2 v4.8h, v9.4s \n" - "str q4, [%[out]], #16 \n" - "bne 0b \n" - : [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop) - : [scale] "w" (vscale) - : "v0", "v1", "v4", "v5", "v8", "v9" - ); + asm volatile( + "ldp q0, q1, [%[in]], #32 \n" + "0: \n" /* main loop */ + "fmul v4.4s, v0.4s, %[scale].4s \n" + "fmul v5.4s, v1.4s, %[scale].4s \n" + "ldp q0, q1, [%[in]], #32 \n" + "subs %[cnt], %[cnt], #1 \n" + "FCVTAS v8.4s, v4.4s \n" + "FCVTAS v9.4s, v5.4s \n" + "sqxtn v4.4h, v8.4s \n" + "sqxtn2 v4.8h, v9.4s \n" + "str q4, [%[out]], #16 \n" + "bne 0b \n" + : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop) + : [scale] "w"(vscale) + : "v0", "v1", "v4", "v5", "v8", "v9"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[din]]! 
@ load in0~in7\n" - "0: @ main loop\n" - "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" - "vand.i32 q5, q4, q4 @ set offset, 0.5\n" - "vand.i32 q6, q4, q4 @ set offset, 0.5\n" - "vand.i32 q7, q4, q4 @ set offset, 0.5\n" - "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" - "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" - "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" - "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" - "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" - "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" - "vcvt.s32.f32 q0, q4 @ cvt to int32\n" - "vcvt.s32.f32 q1, q5 @ cvt to int32\n" - "vqmovn.s32 d8, q0 @ cnt to int16\n" - "vqmovn.s32 d9, q1 @ cnt to int16\n" - "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" - "vst1.32 {d8-d9}, [%[dout]]! @ write to output\n" - "subs %[cnt], #1 @ loop count -1\n" - "bne 0b @ to main loop\n" - - :[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop) - :[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero) - :"q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9" - ); + asm volatile( + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "0: @ main loop\n" + "vand.i32 q4, %q[vpoff], %q[vpoff] @ set offset, 0.5\n" + "vand.i32 q5, q4, q4 @ set offset, 0.5\n" + "vand.i32 q6, q4, q4 @ set offset, 0.5\n" + "vand.i32 q7, q4, q4 @ set offset, 0.5\n" + "vcgt.f32 q8, q0, %q[vzero] @ get mask > 0, in0\n" + "vcgt.f32 q9, q1, %q[vzero] @ get mask > 0, in1\n" + "vbif.f32 q4, %q[vnoff], q8 @ get right offset\n" + "vbif.f32 q5, %q[vnoff], q9 @ get right offset\n" + "vmla.f32 q4, q0, %q[vscale] @ mul scale\n" + "vmla.f32 q5, q1, %q[vscale] @ mul scale\n" + "vcvt.s32.f32 q0, q4 @ cvt to int32\n" + "vcvt.s32.f32 q1, q5 @ cvt to int32\n" + "vqmovn.s32 d8, q0 @ cnt to int16\n" + "vqmovn.s32 d9, q1 @ cnt to int16\n" + "vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n" + "vst1.32 {d8-d9}, [%[dout]]! 
@ write to output\n" + "subs %[cnt], #1 @ loop count -1\n" + "bne 0b @ to main loop\n" + + : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) + : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff), + [vzero] "w"(vzero) + : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"); #endif - } - const float* din_r = din_c + 8 * cnt; - int16_t* dout_r = dout_c + 8 * cnt; - for (int i = 0; i < remain; ++i) { - dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); - } } + const float* din_r = din_c + 8 * cnt; + int16_t* dout_r = dout_c + 8 * cnt; + for (int i = 0; i < remain; ++i) { + dout_r[i] = saturate_cast(roundf(inv_scale * din_r[i])); + } + } } void int8_to_fp32(const signed char* in, float* out, const float* scale, - int axis_size, long long outer_size, long long inner_size) { - - int cnt = inner_size / 16; - int remain = inner_size & 15; - long long loop_size = axis_size * outer_size; + int axis_size, int64_t outer_size, int64_t inner_size) { + int cnt = inner_size / 16; + int remain = inner_size & 15; + int64_t loop_size = axis_size * outer_size; #pragma omp parallel for - for (long long n = 0; n < loop_size; ++n) { - float in_scale = scale[n % axis_size]; - const signed char* din_c = in + n * inner_size; - float* dout_c = out + n * inner_size; - float32x4_t vscale = vdupq_n_f32(in_scale); - if (cnt > 0) { - int loop = cnt; - const signed char* din_ptr = din_c; - float* dout_ptr = dout_c; + for (int64_t n = 0; n < loop_size; ++n) { + float in_scale = scale[n % axis_size]; + const signed char* din_c = in + n * inner_size; + float* dout_c = out + n * inner_size; + float32x4_t vscale = vdupq_n_f32(in_scale); + if (cnt > 0) { + int loop = cnt; + const signed char* din_ptr = din_c; + float* dout_ptr = dout_c; #ifdef __aarch64__ - asm volatile( - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - "0: \n" /* main loop */ - "sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/ - "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ - - "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ - "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ - "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ - "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ - - "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ - - "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ - "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ - "scvtf v10.4s, v6.4s \n" /* trans to fp32*/ - "scvtf v11.4s, v7.4s \n" /* trans to fp32*/ - - "subs %[loop], %[loop], #1 \n" - - "fmul v4.4s, v8.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v5.4s, v9.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/ - "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/ - - "stp q4, q5, [%[out]], #32 \n" /* write to memory*/ - "stp q6, q7, [%[out]], #32 \n" /* write to memory*/ - - "bne 0b \n" - :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr) - :[scale] "w" (vscale) - :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11" - ); + asm volatile( + "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ + "0: \n" /* main loop */ + "sshll v2.8h, v0.8b, #0 \n" /* trans to int16*/ + "sshll v3.8h, v1.8b, #0 \n" /* trans to int16*/ + + "sshll v4.4s, v2.4h, #0 \n" /* trans to int32*/ + "sshll2 v5.4s, v2.8h, #0 \n" /* trans to int32*/ + "sshll v6.4s, v3.4h, #0 \n" /* trans to int32*/ + "sshll2 v7.4s, v3.8h, #0 \n" /* trans to int32*/ + + "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8*/ + + "scvtf v8.4s, v4.4s \n" /* trans to fp32*/ + "scvtf v9.4s, v5.4s \n" /* trans to fp32*/ + "scvtf v10.4s, v6.4s \n" /* trans to 
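[Note] A scalar model of what the quantization loops above compute (illustrative only, not part of this patch). FCVTAS on AArch64, and the +/-0.5 offset followed by truncation on ARMv7, both realize round-to-nearest with ties away from zero, i.e. the behavior of roundf(); the sqxtn narrowing then saturates to the target range:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: quantize one fp32 value to int8 the way the
// assembly does (round half away from zero, then saturate).
inline int8_t quantize_one(float v, float inv_scale) {
  float r = std::roundf(v * inv_scale);      // ties away from zero
  r = std::min(std::max(r, -128.f), 127.f);  // saturate like sqxtn
  return static_cast<int8_t>(r);
}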
 void int8_to_fp32(const signed char* in, float* out, const float* scale,
-                  int axis_size, long long outer_size, long long inner_size) {
-
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = axis_size * outer_size;
+                  int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = axis_size * outer_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const signed char* din_c = in + n * inner_size;
-        float* dout_c = out + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        if (cnt > 0) {
-            int loop = cnt;
-            const signed char* din_ptr = din_c;
-            float* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const signed char* din_c = in + n * inner_size;
+    float* dout_c = out + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    if (cnt > 0) {
+      int loop = cnt;
+      const signed char* din_ptr = din_c;
+      float* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-                "ldp d0, d1, [%[in]], #16  \n" /* load 16 int8*/
-                "0:  \n" /* main loop */
-                "sshll v2.8h, v0.8b, #0  \n" /* trans to int16*/
-                "sshll v3.8h, v1.8b, #0  \n" /* trans to int16*/
-                "sshll v4.4s, v2.4h, #0  \n" /* trans to int32*/
-                "sshll2 v5.4s, v2.8h, #0  \n" /* trans to int32*/
-                "sshll v6.4s, v3.4h, #0  \n" /* trans to int32*/
-                "sshll2 v7.4s, v3.8h, #0  \n" /* trans to int32*/
-                "ldp d0, d1, [%[in]], #16  \n" /* load 16 int8*/
-                "scvtf v8.4s, v4.4s  \n" /* trans to fp32*/
-                "scvtf v9.4s, v5.4s  \n" /* trans to fp32*/
-                "scvtf v10.4s, v6.4s  \n" /* trans to fp32*/
-                "scvtf v11.4s, v7.4s  \n" /* trans to fp32*/
-                "subs %[loop], %[loop], #1  \n"
-                "fmul v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
-                "fmul v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
-                "fmul v6.4s, v10.4s, %[scale].4s  \n" /* mul with scale*/
-                "fmul v7.4s, v11.4s, %[scale].4s  \n" /* mul with scale*/
-                "stp q4, q5, [%[out]], #32  \n" /* write to memory*/
-                "stp q6, q7, [%[out]], #32  \n" /* write to memory*/
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-            );
+      asm volatile(
+          "ldp d0, d1, [%[in]], #16  \n" /* load 16 int8*/
+          "0:  \n" /* main loop */
+          "sshll v2.8h, v0.8b, #0  \n" /* trans to int16*/
+          "sshll v3.8h, v1.8b, #0  \n" /* trans to int16*/
+
+          "sshll v4.4s, v2.4h, #0  \n" /* trans to int32*/
+          "sshll2 v5.4s, v2.8h, #0  \n" /* trans to int32*/
+          "sshll v6.4s, v3.4h, #0  \n" /* trans to int32*/
+          "sshll2 v7.4s, v3.8h, #0  \n" /* trans to int32*/
+
+          "ldp d0, d1, [%[in]], #16  \n" /* load 16 int8*/
+
+          "scvtf v8.4s, v4.4s  \n" /* trans to fp32*/
+          "scvtf v9.4s, v5.4s  \n" /* trans to fp32*/
+          "scvtf v10.4s, v6.4s  \n" /* trans to fp32*/
+          "scvtf v11.4s, v7.4s  \n" /* trans to fp32*/
+
+          "subs %[loop], %[loop], #1  \n"
+
+          "fmul v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul v6.4s, v10.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul v7.4s, v11.4s, %[scale].4s  \n" /* mul with scale*/
+
+          "stp q4, q5, [%[out]], #32  \n" /* write to memory*/
+          "stp q6, q7, [%[out]], #32  \n" /* write to memory*/
+
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11");
 #else
-            asm volatile(
-                "vld1.32 {d0-d1}, [%[in]]!  @ load 16 int8\n"
-                "0:  @ main loop\n"
-                "vmovl.s8 q2, d0  @ trans to int16\n"
-                "vmovl.s8 q3, d1  @ trans to int16\n"
-                "vmovl.s16 q4, d4  @ trans to int32\n"
-                "vmovl.s16 q5, d5  @ trans to int32\n"
-                "vmovl.s16 q6, d6  @ trans to int32\n"
-                "vmovl.s16 q7, d7  @ trans to int32\n"
-                "vcvt.f32.s32 q0, q4  @ trans to fp32\n"
-                "vcvt.f32.s32 q1, q5  @ trans to fp32\n"
-                "vcvt.f32.s32 q2, q6  @ trans to fp32\n"
-                "vcvt.f32.s32 q3, q7  @ trans to fp32\n"
-                "vmul.f32 q4, q0, %q[scale]  @ mul with scale\n"
-                "vmul.f32 q5, q1, %q[scale]  @ mul with scale\n"
-                "vmul.f32 q6, q2, %q[scale]  @ mul with scale\n"
-                "vmul.f32 q7, q3, %q[scale]  @ mul with scale\n"
-                "vld1.32 {d0-d1}, [%[in]]!  @ load 16 int8\n"
-                "subs %[loop], #1  \n"
-                "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory\n"
-                "vst1.f32 {d12-d15}, [%[out]]!  @ write to memory\n"
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
-            );
-#endif //__aarch64__
-        }
-        const signed char* din_r = din_c + 16 * cnt;
-        float* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = in_scale * din_r[i];
-        }
+      asm volatile(
+          "vld1.32 {d0-d1}, [%[in]]!  @ load 16 int8\n"
+          "0:  @ main loop\n"
+          "vmovl.s8 q2, d0  @ trans to int16\n"
+          "vmovl.s8 q3, d1  @ trans to int16\n"
+          "vmovl.s16 q4, d4  @ trans to int32\n"
+          "vmovl.s16 q5, d5  @ trans to int32\n"
+          "vmovl.s16 q6, d6  @ trans to int32\n"
+          "vmovl.s16 q7, d7  @ trans to int32\n"
+          "vcvt.f32.s32 q0, q4  @ trans to fp32\n"
+          "vcvt.f32.s32 q1, q5  @ trans to fp32\n"
+          "vcvt.f32.s32 q2, q6  @ trans to fp32\n"
+          "vcvt.f32.s32 q3, q7  @ trans to fp32\n"
+          "vmul.f32 q4, q0, %q[scale]  @ mul with scale\n"
+          "vmul.f32 q5, q1, %q[scale]  @ mul with scale\n"
+          "vmul.f32 q6, q2, %q[scale]  @ mul with scale\n"
+          "vmul.f32 q7, q3, %q[scale]  @ mul with scale\n"
+
+          "vld1.32 {d0-d1}, [%[in]]!  @ load 16 int8\n"
+
+          "subs %[loop], #1  \n"
+
+          "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory\n"
+          "vst1.f32 {d12-d15}, [%[out]]!  @ write to memory\n"
+
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+#endif  // __aarch64__
+    }
+    const signed char* din_r = din_c + 16 * cnt;
+    float* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = in_scale * din_r[i];
     }
+  }
 }
 
-void int16_to_fp32(const short* in, float* out, const float* scale,
-                   int axis_size, long long outer_size, long long inner_size) {
-
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = axis_size * outer_size;
+void int16_to_fp32(const int16_t* in, float* out, const float* scale,
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = axis_size * outer_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const short* din_c = in + n * inner_size;
-        float* dout_c = out + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        if (cnt > 0) {
-            int loop = cnt;
-            const short* din_ptr = din_c;
-            float* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const int16_t* din_c = in + n * inner_size;
+    float* dout_c = out + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    if (cnt > 0) {
+      int loop = cnt;
+      const int16_t* din_ptr = din_c;
+      float* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-                "ldp q0, q1, [%[in]], #32  \n" /* load 16 int16*/
-                "0:  \n" /* main loop */
-                "sshll v4.4s, v0.4h, #0  \n" /* trans to int32*/
-                "sshll2 v5.4s, v0.8h, #0  \n" /* trans to int32*/
-                "sshll v6.4s, v1.4h, #0  \n" /* trans to int32*/
-                "sshll2 v7.4s, v1.8h, #0  \n" /* trans to int32*/
-                "ldp q0, q1, [%[in]], #32  \n" /* load 16 int16*/
-                "scvtf v8.4s, v4.4s  \n" /* trans to fp32*/
-                "scvtf v9.4s, v5.4s  \n" /* trans to fp32*/
-                "scvtf v10.4s, v6.4s  \n" /* trans to fp32*/
-                "scvtf v11.4s, v7.4s  \n" /* trans to fp32*/
-                "subs %[loop], %[loop], #1  \n"
-                "fmul v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
-                "fmul v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
-                "fmul v6.4s, v10.4s, %[scale].4s  \n" /* mul with scale*/
-                "fmul v7.4s, v11.4s, %[scale].4s  \n" /* mul with scale*/
-                "stp q4, q5, [%[out]], #32  \n" /* write to memory*/
-                "stp q6, q7, [%[out]], #32  \n" /* write to memory*/
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-            );
+      asm volatile(
+          "ldp q0, q1, [%[in]], #32  \n" /* load 16 int16*/
+          "0:  \n" /* main loop */
+          "sshll v4.4s, v0.4h, #0  \n" /* trans to int32*/
+          "sshll2 v5.4s, v0.8h, #0  \n" /* trans to int32*/
+          "sshll v6.4s, v1.4h, #0  \n" /* trans to int32*/
+          "sshll2 v7.4s, v1.8h, #0  \n" /* trans to int32*/
+
+          "ldp q0, q1, [%[in]], #32  \n" /* load 16 int16*/
+
+          "scvtf v8.4s, v4.4s  \n" /* trans to fp32*/
+          "scvtf v9.4s, v5.4s  \n" /* trans to fp32*/
+          "scvtf v10.4s, v6.4s  \n" /* trans to fp32*/
+          "scvtf v11.4s, v7.4s  \n" /* trans to fp32*/
+
+          "subs %[loop], %[loop], #1  \n"
+
+          "fmul v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul v6.4s, v10.4s, %[scale].4s  \n" /* mul with scale*/
+          "fmul v7.4s, v11.4s, %[scale].4s  \n" /* mul with scale*/
+
+          "stp q4, q5, [%[out]], #32  \n" /* write to memory*/
+          "stp q6, q7, [%[out]], #32  \n" /* write to memory*/
+
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
 #else
-            asm volatile(
-                "vld1.32 {d0-d3}, [%[in]]!  @ load 16 int16\n"
-                "0:  @ main loop\n"
-                "vmovl.s16 q4, d0  @ trans to int32\n"
-                "vmovl.s16 q5, d1  @ trans to int32\n"
-                "vmovl.s16 q6, d2  @ trans to int32\n"
-                "vmovl.s16 q7, d3  @ trans to int32\n"
-                "vcvt.f32.s32 q0, q4  @ trans to fp32\n"
-                "vcvt.f32.s32 q1, q5  @ trans to fp32\n"
-                "vcvt.f32.s32 q2, q6  @ trans to fp32\n"
-                "vcvt.f32.s32 q3, q7  @ trans to fp32\n"
-                "vmul.f32 q4, q0, %q[scale]  @ mul with scale\n"
-                "vmul.f32 q5, q1, %q[scale]  @ mul with scale\n"
-                "vmul.f32 q6, q2, %q[scale]  @ mul with scale\n"
-                "vmul.f32 q7, q3, %q[scale]  @ mul with scale\n"
-                "vld1.32 {d0-d3}, [%[in]]!  @ load 16 int8\n"
-                "subs %[loop], #1  \n"
-                "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory\n"
-                "vst1.f32 {d12-d15}, [%[out]]!  @ write to memory\n"
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
-            );
-#endif //__aarch64__
-        }
-        const short* din_r = din_c + 16 * cnt;
-        float* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = in_scale * din_r[i];
-        }
+      asm volatile(
+          "vld1.32 {d0-d3}, [%[in]]!  @ load 16 int16\n"
+          "0:  @ main loop\n"
+          "vmovl.s16 q4, d0  @ trans to int32\n"
+          "vmovl.s16 q5, d1  @ trans to int32\n"
+          "vmovl.s16 q6, d2  @ trans to int32\n"
+          "vmovl.s16 q7, d3  @ trans to int32\n"
+          "vcvt.f32.s32 q0, q4  @ trans to fp32\n"
+          "vcvt.f32.s32 q1, q5  @ trans to fp32\n"
+          "vcvt.f32.s32 q2, q6  @ trans to fp32\n"
+          "vcvt.f32.s32 q3, q7  @ trans to fp32\n"
+          "vmul.f32 q4, q0, %q[scale]  @ mul with scale\n"
+          "vmul.f32 q5, q1, %q[scale]  @ mul with scale\n"
+          "vmul.f32 q6, q2, %q[scale]  @ mul with scale\n"
+          "vmul.f32 q7, q3, %q[scale]  @ mul with scale\n"
+
+          "vld1.32 {d0-d3}, [%[in]]!  @ load 16 int8\n"
+
+          "subs %[loop], #1  \n"
+
+          "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory\n"
+          "vst1.f32 {d12-d15}, [%[out]]!  @ write to memory\n"
+
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
+#endif  // __aarch64__
+    }
+    const int16_t* din_r = din_c + 16 * cnt;
+    float* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = in_scale * din_r[i];
     }
+  }
 }
 
 void int32_to_fp32(const int* din, float* dout, const float* scale,
-                   int axis_size, long long outer_size, long long inner_size) {
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = axis_size * outer_size;
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = axis_size * outer_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const int* din_c = din + n * inner_size;
-        float* dout_c = dout + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        if (cnt > 0) {
-            int loop = cnt;
-            const int* din_ptr = din_c;
-            float* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const int* din_c = din + n * inner_size;
+    float* dout_c = dout + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    if (cnt > 0) {
+      int loop = cnt;
+      const int* din_ptr = din_c;
+      float* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-                "ldp q0, q1, [%[in]], #32  \n"
-                "ldp q2, q3, [%[in]], #32  \n"
-                "0:  \n"
-                "scvtf v4.4s, v0.4s  \n"
-                "scvtf v5.4s, v1.4s  \n"
-                "scvtf v6.4s, v2.4s  \n"
-                "scvtf v7.4s, v3.4s  \n"
-                "ldp q0, q1, [%[in]], #32  \n"
-                "fmul v8.4s, v4.4s, %[scale].4s  \n"
-                "fmul v9.4s, v5.4s, %[scale].4s  \n"
-                "fmul v10.4s, v6.4s, %[scale].4s  \n"
-                "fmul v11.4s, v7.4s, %[scale].4s  \n"
-                "ldp q2, q3, [%[in]], #32  \n"
-                "stp q8, q9, [%[out]], #32  \n"
-                "stp q10, q11, [%[out]], #32  \n"
-                "subs %[loop], %[loop], #1  \n"
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
-            );
+      asm volatile(
+          "ldp q0, q1, [%[in]], #32  \n"
+          "ldp q2, q3, [%[in]], #32  \n"
+          "0:  \n"
+          "scvtf v4.4s, v0.4s  \n"
+          "scvtf v5.4s, v1.4s  \n"
+          "scvtf v6.4s, v2.4s  \n"
+          "scvtf v7.4s, v3.4s  \n"
+          "ldp q0, q1, [%[in]], #32  \n"
+          "fmul v8.4s, v4.4s, %[scale].4s  \n"
+          "fmul v9.4s, v5.4s, %[scale].4s  \n"
+          "fmul v10.4s, v6.4s, %[scale].4s  \n"
+          "fmul v11.4s, v7.4s, %[scale].4s  \n"
+          "ldp q2, q3, [%[in]], #32  \n"
+          "stp q8, q9, [%[out]], #32  \n"
+          "stp q10, q11, [%[out]], #32  \n"
+          "subs %[loop], %[loop], #1  \n"
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
+            "v11");
 #else
-            asm volatile(
-                "vld1.s32 {d0-d3}, [%[in]]!  \n"
-                "vld1.s32 {d4-d7}, [%[in]]!  \n"
-                "0:  \n"
-                "vcvt.f32.s32 q4, q0  \n"
-                "vcvt.f32.s32 q5, q1  \n"
-                "vcvt.f32.s32 q6, q2  \n"
-                "vcvt.f32.s32 q7, q3  \n"
-                "vld1.s32 {d0-d3}, [%[in]]!  \n"
-                "vmul.f32 q8, q4, %q[scale]  \n"
-                "vmul.f32 q9, q5, %q[scale]  \n"
-                "vmul.f32 q10, q6, %q[scale]  \n"
-                "vmul.f32 q11, q7, %q[scale]  \n"
-                "vld1.s32 {d4-d7}, [%[in]]!  \n"
-                "subs %[loop], #1  \n"
-                "vst1.f32 {d16-d19}, [%[out]]!  \n"
-                "vst1.f32 {d20-d23}, [%[out]]!  \n"
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
-            );
-#endif //__aarch64__
-        }
-        const int* din_r = din_c + 16 * cnt;
-        float* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = in_scale * din_r[i];
-        }
+      asm volatile(
+          "vld1.s32 {d0-d3}, [%[in]]!  \n"
+          "vld1.s32 {d4-d7}, [%[in]]!  \n"
+          "0:  \n"
+          "vcvt.f32.s32 q4, q0  \n"
+          "vcvt.f32.s32 q5, q1  \n"
+          "vcvt.f32.s32 q6, q2  \n"
+          "vcvt.f32.s32 q7, q3  \n"
+          "vld1.s32 {d0-d3}, [%[in]]!  \n"
+          "vmul.f32 q8, q4, %q[scale]  \n"
+          "vmul.f32 q9, q5, %q[scale]  \n"
+          "vmul.f32 q10, q6, %q[scale]  \n"
+          "vmul.f32 q11, q7, %q[scale]  \n"
+          "vld1.s32 {d4-d7}, [%[in]]!  \n"
+          "subs %[loop], #1  \n"
+          "vst1.f32 {d16-d19}, [%[out]]!  \n"
+          "vst1.f32 {d20-d23}, [%[out]]!  \n"
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+            "q11");
+#endif  // __aarch64__
+    }
+    const int* din_r = din_c + 16 * cnt;
+    float* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = in_scale * din_r[i];
     }
+  }
 }
 
-void int32_to_int8(const int* din, signed char* dout, const float* scale, \
-    int axis_size, long long outer_size, long long inner_size) {
-    int cnt = inner_size / 16;
-    int remain = inner_size & 15;
-    long long loop_size = outer_size * axis_size;
+void int32_to_int8(const int* din, signed char* dout, const float* scale,
+                   int axis_size, int64_t outer_size, int64_t inner_size) {
+  int cnt = inner_size / 16;
+  int remain = inner_size & 15;
+  int64_t loop_size = outer_size * axis_size;
 #pragma omp parallel for
-    for (long long n = 0; n < loop_size; ++n) {
-        float in_scale = scale[n % axis_size];
-        const int* din_c = din + n * inner_size;
-        signed char* dout_c = dout + n * inner_size;
-        float32x4_t vscale = vdupq_n_f32(in_scale);
-        float32x4_t vzero = vdupq_n_f32(0.f);
-        float32x4_t vpoff = vdupq_n_f32(0.5f);
-        float32x4_t vnoff = vdupq_n_f32(-0.5f);
-        if (cnt > 0) {
-            int loop = cnt;
-            const int* din_ptr = din_c;
-            signed char* dout_ptr = dout_c;
+  for (int64_t n = 0; n < loop_size; ++n) {
+    float in_scale = scale[n % axis_size];
+    const int* din_c = din + n * inner_size;
+    signed char* dout_c = dout + n * inner_size;
+    float32x4_t vscale = vdupq_n_f32(in_scale);
+    float32x4_t vzero = vdupq_n_f32(0.f);
+    float32x4_t vpoff = vdupq_n_f32(0.5f);
+    float32x4_t vnoff = vdupq_n_f32(-0.5f);
+    if (cnt > 0) {
+      int loop = cnt;
+      const int* din_ptr = din_c;
+      signed char* dout_ptr = dout_c;
 #ifdef __aarch64__
-            asm volatile(
-                "0:  \n"
-                "ld1 {v0.4s, v1.4s}, [%[in]], #32  \n"
-                "ld1 {v2.4s, v3.4s}, [%[in]], #32  \n"
-                "scvtf v4.4s, v0.4s  \n"
-                "scvtf v5.4s, v1.4s  \n"
-                "scvtf v6.4s, v2.4s  \n"
-                "scvtf v7.4s, v3.4s  \n"
-                "fmul v0.4s, v4.4s, %[scale].4s  \n"
-                "fmul v1.4s, v5.4s, %[scale].4s  \n"
-                "fmul v2.4s, v6.4s, %[scale].4s  \n"
-                "fmul v3.4s, v7.4s, %[scale].4s  \n"
-                "fcvtas v4.4s, v0.4s  \n"
-                "fcvtas v5.4s, v1.4s  \n"
-                "fcvtas v6.4s, v2.4s  \n"
-                "fcvtas v7.4s, v3.4s  \n"
-                "sqxtn v0.4h, v4.4s  \n"
-                "sqxtn2 v0.8h, v5.4s  \n"
-                "sqxtn v1.4h, v6.4s  \n"
-                "sqxtn2 v1.8h, v7.4s  \n"
-                "sqxtn v2.8b, v0.8h  \n"
-                "sqxtn2 v2.16b, v1.8h  \n"
-                "st1 {v2.16b}, [%[out]], #16  \n"
-                "subs %[loop], %[loop], #1  \n"
-                "bne 0b  \n"
-                :[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
-                :[scale] "w" (vscale)
-                :"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
-            );
+      asm volatile(
+          "0:  \n"
+          "ld1 {v0.4s, v1.4s}, [%[in]], #32  \n"
+          "ld1 {v2.4s, v3.4s}, [%[in]], #32  \n"
+
+          "scvtf v4.4s, v0.4s  \n"
+          "scvtf v5.4s, v1.4s  \n"
+          "scvtf v6.4s, v2.4s  \n"
+          "scvtf v7.4s, v3.4s  \n"
+
+          "fmul v0.4s, v4.4s, %[scale].4s  \n"
+          "fmul v1.4s, v5.4s, %[scale].4s  \n"
+          "fmul v2.4s, v6.4s, %[scale].4s  \n"
+          "fmul v3.4s, v7.4s, %[scale].4s  \n"
+
+          "fcvtas v4.4s, v0.4s  \n"
+          "fcvtas v5.4s, v1.4s  \n"
+          "fcvtas v6.4s, v2.4s  \n"
+          "fcvtas v7.4s, v3.4s  \n"
+
+          "sqxtn v0.4h, v4.4s  \n"
+          "sqxtn2 v0.8h, v5.4s  \n"
+          "sqxtn v1.4h, v6.4s  \n"
+          "sqxtn2 v1.8h, v7.4s  \n"
+
+          "sqxtn v2.8b, v0.8h  \n"
+          "sqxtn2 v2.16b, v1.8h  \n"
+
+          "st1 {v2.16b}, [%[out]], #16  \n"
+          "subs %[loop], %[loop], #1  \n"
+          "bne 0b  \n"
+          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
+          : [scale] "w"(vscale)
+          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 #else
-            asm volatile(
-                "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7\n"
-                "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16\n"
-                "0:  @ main loop\n"
-                "vcvt.f32.s32 q4, q0  @ cvt to float\n"
-                "vcvt.f32.s32 q5, q1  @ cvt to float\n"
-                "vcvt.f32.s32 q6, q2  @ cvt to float\n"
-                "vcvt.f32.s32 q7, q3  @ cvt to float\n"
-                "vand.i32 q0, %q[vpoff], %q[vpoff]  @ set offset, 0.5\n"
-                "vand.i32 q1, q0, q0  @ set offset, 0.5\n"
-                "vand.i32 q2, q0, q0  @ set offset, 0.5\n"
-                "vand.i32 q3, q0, q0  @ set offset, 0.5\n"
-                "vcgt.f32 q8, q4, %q[vzero]  @ get mask > 0, in0\n"
-                "vcgt.f32 q9, q5, %q[vzero]  @ get mask > 0, in1\n"
-                "vcgt.f32 q10, q6, %q[vzero]  @ get mask > 0, in2\n"
-                "vcgt.f32 q11, q7, %q[vzero]  @ get mask > 0, in3\n"
-                "vbif.f32 q0, %q[vnoff], q8  @ get right offset\n"
-                "vbif.f32 q1, %q[vnoff], q9  @ get right offset\n"
-                "vbif.f32 q2, %q[vnoff], q10  @ get right offset\n"
-                "vbif.f32 q3, %q[vnoff], q11  @ get right offset\n"
-                "vmla.f32 q0, q4, %q[vscale]  @ mul scale\n"
-                "vmla.f32 q1, q5, %q[vscale]  @ mul scale\n"
-                "vmla.f32 q2, q6, %q[vscale]  @ mul scale\n"
-                "vmla.f32 q3, q7, %q[vscale]  @ mul scale\n"
-                "vcvt.s32.f32 q4, q0  @ cvt to int32\n"
-                "vcvt.s32.f32 q5, q1  @ cvt to int32\n"
-                "vcvt.s32.f32 q6, q2  @ cvt to int32\n"
-                "vcvt.s32.f32 q7, q3  @ cvt to int32\n"
-                "vqmovn.s32 d16, q4  @ cnt to int16\n"
-                "vqmovn.s32 d17, q5  @ cnt to int16\n"
-                "vqmovn.s32 d18, q6  @ cnt to int16\n"
-                "vqmovn.s32 d19, q7  @ cnt to int16\n"
-                "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7\n"
-                "vqmovn.s16 d8, q8  @ cnt to int8\n"
-                "vqmovn.s16 d9, q9  @ cnt to int8\n"
-                "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16\n"
-                "vst1.32 {d8-d9}, [%[dout]]!  @ write to output\n"
-                "subs %[loop], #1  @ loop count -1\n"
-                "bne 0b  @ to main loop\n"
-                :[loop] "+r" (loop), [din] "+r" (din_ptr), [dout] "+r" (dout_ptr)
-                :[vscale] "w" (vscale), [vzero] "w"(vzero), [vnoff] "w" (vnoff), [vpoff] "w" (vpoff)
-                :"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
-            );
-#endif //__aarch64__
-        }
-        const int* din_r = din_c + 16 * cnt;
-        int8_t* dout_r = dout_c + 16 * cnt;
-        for (int i = 0; i < remain; ++i) {
-            dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
-        }
+      asm volatile(
+          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7\n"
+          "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16\n"
+          "0:  @ main loop\n"
+          "vcvt.f32.s32 q4, q0  @ cvt to float\n"
+          "vcvt.f32.s32 q5, q1  @ cvt to float\n"
+          "vcvt.f32.s32 q6, q2  @ cvt to float\n"
+          "vcvt.f32.s32 q7, q3  @ cvt to float\n"
+          "vand.i32 q0, %q[vpoff], %q[vpoff]  @ set offset, 0.5\n"
+          "vand.i32 q1, q0, q0  @ set offset, 0.5\n"
+          "vand.i32 q2, q0, q0  @ set offset, 0.5\n"
+          "vand.i32 q3, q0, q0  @ set offset, 0.5\n"
+          "vcgt.f32 q8, q4, %q[vzero]  @ get mask > 0, in0\n"
+          "vcgt.f32 q9, q5, %q[vzero]  @ get mask > 0, in1\n"
+          "vcgt.f32 q10, q6, %q[vzero]  @ get mask > 0, in2\n"
+          "vcgt.f32 q11, q7, %q[vzero]  @ get mask > 0, in3\n"
+          "vbif.f32 q0, %q[vnoff], q8  @ get right offset\n"
+          "vbif.f32 q1, %q[vnoff], q9  @ get right offset\n"
+          "vbif.f32 q2, %q[vnoff], q10  @ get right offset\n"
+          "vbif.f32 q3, %q[vnoff], q11  @ get right offset\n"
+          "vmla.f32 q0, q4, %q[vscale]  @ mul scale\n"
+          "vmla.f32 q1, q5, %q[vscale]  @ mul scale\n"
+          "vmla.f32 q2, q6, %q[vscale]  @ mul scale\n"
+          "vmla.f32 q3, q7, %q[vscale]  @ mul scale\n"
+          "vcvt.s32.f32 q4, q0  @ cvt to int32\n"
+          "vcvt.s32.f32 q5, q1  @ cvt to int32\n"
+          "vcvt.s32.f32 q6, q2  @ cvt to int32\n"
+          "vcvt.s32.f32 q7, q3  @ cvt to int32\n"
+          "vqmovn.s32 d16, q4  @ cnt to int16\n"
+          "vqmovn.s32 d17, q5  @ cnt to int16\n"
+          "vqmovn.s32 d18, q6  @ cnt to int16\n"
+          "vqmovn.s32 d19, q7  @ cnt to int16\n"
+          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7\n"
+          "vqmovn.s16 d8, q8  @ cnt to int8\n"
+          "vqmovn.s16 d9, q9  @ cnt to int8\n"
+          "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16\n"
+          "vst1.32 {d8-d9}, [%[dout]]!  @ write to output\n"
+          "subs %[loop], #1  @ loop count -1\n"
+          "bne 0b  @ to main loop\n"
+          : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
+          : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
+            [vpoff] "w"(vpoff)
+          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
+            "q11");
+#endif  // __aarch64__
+    }
+    const int* din_r = din_c + 16 * cnt;
+    int8_t* dout_r = dout_c + 16 * cnt;
+    for (int i = 0; i < remain; ++i) {
+      dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
     }
+  }
 }
 
-void int32_to_int32(const int* din, int* dout, const float* scale, \
-    int axis_size, long long outer_size, long long inner_size) {
-    int size_all = outer_size * axis_size * inner_size;
-    memmove(dout, din, size_all*sizeof(int));
+void int32_to_int32(const int* din, int* dout, const float* scale,
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  int size_all = outer_size * axis_size * inner_size;
+  memmove(dout, din, size_all * sizeof(int));
 }
 
 template <>
 void int32_to_dtype(const int* din, float* dout, const float* scale,
-                    int axis_size, long long outer_size, long long inner_size) {
-
-    return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
 }
 
 template <>
 void int32_to_dtype(const int* din, signed char* dout, const float* scale,
-                    int axis_size, long long outer_size, long long inner_size) {
-
-    return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
 }
 
 template <>
 void int32_to_dtype(const int* din, int* dout, const float* scale,
-                    int axis_size, long long outer_size, long long inner_size) {
-
-    return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
+                    int axis_size, int64_t outer_size, int64_t inner_size) {
+  return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
 }
 
 }  // namespace math
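[Note] A hypothetical call site for the `int32_to_dtype` dispatch above, assuming `type_trans.h` declares these entry points (illustrative only; names and shapes are made up): dequantizing a per-channel int32 GEMM output back to fp32 with one scale per output channel.

#include <cstdint>
#include <vector>

void dequant_example(const int* gemm_out, float* out,
                     const std::vector<float>& channel_scales,
                     int64_t batch, int channels, int64_t spatial) {
  // outer_size = batch, axis_size = channels, inner_size = spatial:
  // out[n][c][i] = channel_scales[c] * gemm_out[n][c][i]
  paddle::lite::arm::math::int32_to_dtype(
      gemm_out, out, channel_scales.data(), channels, batch, spatial);
}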
84cba88d11d8b697510d08c0f576342f0818ab0e..c3d3df9c6778eee53bf6492f4c4bfae36ae80687 100644 --- a/paddle/fluid/lite/core/mir/CMakeLists.txt +++ b/paddle/fluid/lite/core/mir/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(mir_node SRCS node.cc DEPS framework_proto_lite) -cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node) +cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program_lite) cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph) cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes) cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager) @@ -20,14 +20,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) return() endif() cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes) -cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS - mir_ssa_graph scope_lite op_lite - fc_op_lite - ${host_kernels} - mir_passes - mir_pass_manager - program_fake_utils - ) +#cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS + #mir_ssa_graph scope_lite op_lite + #fc_op_lite + #${host_kernels} + #mir_passes + #mir_pass_manager + #program_fake_utils + #) # lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc # DEPS # mul_op_lite @@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite mir_passes compatible_pb_lite program_lite ${ops_lite}) endif() + diff --git a/paddle/fluid/lite/core/naive_test_model.py b/paddle/fluid/lite/core/naive_test_model.py index 832661e5ee86f2759acfeb4a6a410cce6050ad53..f89a5e115fa805bab818cabeab1d63cac00158d0 100644 --- a/paddle/fluid/lite/core/naive_test_model.py +++ b/paddle/fluid/lite/core/naive_test_model.py @@ -18,10 +18,10 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid.backward import append_backward -a = fluid.layers.data(name="a", shape=[100], dtype='float32') -label = fluid.layers.data(name="label", shape=[100], dtype='float32') +a = fluid.layers.data(name="a", shape=[2], dtype='float32') +label = fluid.layers.data(name="label", shape=[10], dtype='float32') -a1 = fluid.layers.fc(input=a, size=500, act=None, bias_attr=False) +a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False) cost = fluid.layers.square_error_cost(a1, label) avg_cost = fluid.layers.mean(cost) @@ -36,7 +36,7 @@ exe.run(fluid.default_startup_program()) with open('startup_program.pb', 'wb') as f: f.write(fluid.default_startup_program().desc.serialize_to_string()) -data_1 = np.array(numpy.random.random([100, 100]), dtype='float32') +#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32') #fluid.default_main_program().desc. 
@@ -50,7 +50,7 @@ with open('main_program.pb', 'wb') as f: #outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost]) -sys.exit(0) +#sys.exit(0) fluid.io.save_inference_model("./model2", [a.name], [a1], exe) -print(numpy.array(outs)) +#print(numpy.array(outs)) diff --git a/paddle/fluid/lite/core/profile/CMakeLists.txt b/paddle/fluid/lite/core/profile/CMakeLists.txt index 43731e8a414cff29b9ac4c681e4e0fd67a52603a..92ac495b6b6b35fce710a3d522ae139e2ce54e0a 100644 --- a/paddle/fluid/lite/core/profile/CMakeLists.txt +++ b/paddle/fluid/lite/core/profile/CMakeLists.txt @@ -4,3 +4,4 @@ endif() lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite) + diff --git a/paddle/fluid/lite/cuda/CMakeLists.txt b/paddle/fluid/lite/cuda/CMakeLists.txt index 505759c7d4afef95423ce3815912794ae28255b0..9889b8b1aa02b9f886bf45aaf9b997f0043c3278 100644 --- a/paddle/fluid/lite/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/cuda/CMakeLists.txt @@ -4,3 +4,4 @@ endif() nv_library(target_wrapper_cuda SRCS target_wrapper.cc) nv_library(cuda_blas_lite SRCS blas.cc) + diff --git a/paddle/fluid/lite/gen_code/CMakeLists.txt b/paddle/fluid/lite/gen_code/CMakeLists.txt index bacfc3e988e6035dba696ac626da7a8072821b52..d6e447a2592856730136e8a80bd671ef52cd295c 100644 --- a/paddle/fluid/lite/gen_code/CMakeLists.txt +++ b/paddle/fluid/lite/gen_code/CMakeLists.txt @@ -18,10 +18,11 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) DEPS scope_lite op_lite kernel_lite paddle_infer_gencode ) - lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ - ${ops_lite} ${host_kernels} - X86_DEPS ${x86_kernels} - ) + # lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__ + # ${ops_lite} ${host_kernels} + # X86_DEPS ${x86_kernels} + # ) - add_dependencies(__generated_code__ test_gen_code_lite) + # add_dependencies(__generated_code__ test_gen_code_lite) endif() + diff --git a/paddle/fluid/lite/host/CMakeLists.txt b/paddle/fluid/lite/host/CMakeLists.txt index 90812f3f3cd712571eb7f11261e23c8dcb78b0fe..7f7cf8b238f99fa9db5569952f9e0e39a8ef9f37 100644 --- a/paddle/fluid/lite/host/CMakeLists.txt +++ b/paddle/fluid/lite/host/CMakeLists.txt @@ -1 +1,2 @@ cc_library(target_wrapper_host SRCS target_wrapper.cc) + diff --git a/paddle/fluid/lite/kernels/CMakeLists.txt b/paddle/fluid/lite/kernels/CMakeLists.txt index ce22ba1216664cdf539ee4f576016adc389622ca..0d2178382d99debe1775bd015701825b0a06133a 100644 --- a/paddle/fluid/lite/kernels/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(arm) add_subdirectory(cuda) add_subdirectory(x86) + diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index 1cf66b0d266b3edf0b0d271ceb5e375f01f652c3..6e4d73ecc6f65c5a5a09178680afe8a6ec7f8445 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -6,10 +6,11 @@ message(STATUS "compile with lite ARM kernels") cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps}) -cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3) +cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS 
${lite_kernel_deps} math_arm) cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm) +cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -18,8 +19,10 @@ lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm mat lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm) +lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm) lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) +lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) set(arm_kernels @@ -29,6 +32,7 @@ set(arm_kernels scale_compute_arm softmax_compute_arm conv_compute_arm + batch_norm_compute_arm elementwise_add_compute_arm pool_compute_arm split_compute_arm @@ -36,3 +40,4 @@ set(arm_kernels set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels") + diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cb43dd5e0430092cb4e3edb13226ca30de61e4d --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" +#include "paddle/fluid/lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void BatchNormCompute::PrepareForRun() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + int64_t channel_size = 0; + switch (param.data_layout) { + case DATALAYOUT(kNCHW): + channel_size = x_dims[1]; + break; + // case DATALAYOUT(kNHWC): + // channel_size = x_dims[x_dims.size() - 1]; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + new_scale.Resize({channel_size}); + new_bias.Resize({channel_size}); + auto* scale_data = param.scale->mutable_data(); + auto* bias_data = param.bias->mutable_data(); + auto* mean_data = param.mean->mutable_data(); + auto* variance_data = param.variance->mutable_data(); + auto* new_scale_data = new_scale.mutable_data(); + auto* new_bias_data = new_bias.mutable_data(); + for (int c = 0; c < channel_size; c++) { + float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon)); + new_bias_data[c] = + bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale_data[c] = inv_scale * scale_data[c]; + } + } +} + +void BatchNormCompute::Run() { + auto& param = this->Param(); + auto x_dims = param.x->dims(); + auto x_data = param.x->mutable_data(); + auto y_data = param.y->mutable_data(); + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + auto* new_scale_data = new_scale.mutable_data(); + auto* new_bias_data = new_bias.mutable_data(); + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t inner_size = 0; + switch (param.data_layout) { + case DATALAYOUT(kNCHW): + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + lite::arm::math::scale(x_data, y_data, outer_size, channel_size, + inner_size, new_scale_data, new_bias_data); + break; + // case DATALAYOUT(kNHWC): + // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); + // channel_size = x_dims[x_dims.size() - 1]; + // lite::arm::math::scale(x_data, y_data, outer_size, channel_size, + // new_scale_data, new_bias_data); + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(param.data_layout); + break; + } + } else { + // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and + // saved_variance + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, + paddle::lite::kernels::arm::BatchNormCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute.h 
b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..cf3ad3accded0db9a95d0f0794c863b4f7b1cd8e --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class BatchNormCompute : public KernelLite { + public: + using param_t = operators::BatchNormParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~BatchNormCompute() = default; + + private: + Tensor new_scale; + Tensor new_bias; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3ca1a0b599b3448fe2dbed08fb37ccc9dae3450c --- /dev/null +++ b/paddle/fluid/lite/kernels/arm/batch_norm_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
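The test that follows checks the NEON kernel against a scalar reference that walks the tensor as outer_size x channel_size x inner_size; for NCHW that is N x C x (H*W), so the scale/bias index depends only on the middle loop. A sketch of that traversal in plain C++ (the helper name is illustrative, not part of the Lite API):

    #include <cstdint>
    #include <vector>

    // Apply a per-channel multiply-add over an NCHW buffer laid out as
    // outer (N) x channel (C) x inner (H*W) -- the same loop nest the
    // ARM kernel and the test's reference implementation both use.
    void per_channel_scale(const float* x, float* y, int64_t outer,
                           int64_t channels, int64_t inner,
                           const std::vector<float>& s,
                           const std::vector<float>& b) {
      for (int64_t o = 0; o < outer; ++o) {
        for (int64_t c = 0; c < channels; ++c) {
          const float* xp = x + (o * channels + c) * inner;
          float* yp = y + (o * channels + c) * inner;
          for (int64_t i = 0; i < inner; ++i) {
            yp[i] = xp[i] * s[c] + b[c];  // one multiply-add per element
          }
        }
      }
    }

The vectorized lite::arm::math::scale covers the inner loop sixteen floats at a time; a scalar form like this is what the EXPECT_NEAR comparisons below effectively trust.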
+ +#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h" +#include +#include +#include +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void batch_norm_compute_ref(const operators::BatchNormParam& param) { + DDim x_dims = param.x->dims(); + auto x_data = param.x->mutable_data(); + auto scale_data = param.scale->mutable_data(); + auto bias_data = param.bias->mutable_data(); + auto mean_data = param.mean->mutable_data(); + auto variance_data = param.variance->mutable_data(); + auto y_data = param.y->mutable_data(); + float epsilon = param.epsilon; + float momentum = param.momentum; + DataLayoutType data_layout = param.data_layout; + + bool global_stats = param.is_test || param.use_global_stats; + if (global_stats) { + int64_t outer_size = 0; + int64_t channel_size = 0; + int64_t inner_size = 0; + switch (data_layout) { + case DATALAYOUT(kNCHW): + outer_size = x_dims[0]; + channel_size = x_dims[1]; + inner_size = x_dims.Slice(2, x_dims.size()).production(); + break; + // case DATALAYOUT(kNHWC): + // outer_size = x_dims.Slice(0, x_dims.size() - 1).production(); + // channel_size = x_dims[x_dims.size() - 1]; + // inner_size = 1; + // break; + default: + LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout); + break; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + dtype norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } else { + // TODO(hong19860320) calculate mean_out, variance_out, saved_mean and + // saved_variance + } +} + +TEST(batch_norm_arm, retrive_op) { + auto batch_norm = + KernelRegistry::Global().Create( + "batch_norm"); + ASSERT_FALSE(batch_norm.empty()); + ASSERT_TRUE(batch_norm.front()); +} + +TEST(batch_norm_arm, init) { + BatchNormCompute batch_norm; + ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(batch_norm.target(), TARGET(kARM)); +} + +TEST(batch_norm_arm, compute) { + DeviceInfo::Init(); + for (auto n : {1, 2}) { + for (auto c : {6, 32 /*, 128*/}) { + for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) { + for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) { + for (auto is_test : {/*false, */ true}) { + for (auto use_global_stats : {false, true}) { + for (auto epsilon : {1e-4f, 1e-5f}) { + for (auto momentum : {0.9f, 0.99f}) { + for (auto data_layout : + {DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) { + Tensor x; + Tensor scale; + Tensor bias; + Tensor mean; + Tensor variance; + Tensor y; + Tensor mean_out; + Tensor variance_out; + Tensor saved_mean; + Tensor saved_variance; + Tensor y_ref; + Tensor mean_out_ref; + Tensor variance_out_ref; + Tensor saved_mean_ref; + Tensor saved_variance_ref; + // set the dims of input, output, ref output tensors + std::vector in_out_shape; + switch (data_layout) { + case DATALAYOUT(kNCHW): + in_out_shape = {n, c, h, w}; + break; + // case DATALAYOUT(kNHWC): + // in_out_shape = {n, h, w, c}; + // break; + default: + LOG(FATAL) << "Unknown storage order: " + << DataLayoutToStr(data_layout); + break; + } + x.Resize(in_out_shape); + scale.Resize({c}); + bias.Resize({c}); + mean.Resize({c}); + variance.Resize({c}); + y.Resize(in_out_shape); + mean_out.Resize({c}); + variance_out.Resize({c}); + saved_mean.Resize({c}); + saved_variance.Resize({c}); + 
y_ref.Resize(in_out_shape); + mean_out_ref.Resize({c}); + variance_out_ref.Resize({c}); + saved_mean_ref.Resize({c}); + saved_variance_ref.Resize({c}); + // initialize the data of input tensors + auto* x_data = x.mutable_data(); + auto* scale_data = scale.mutable_data(); + auto* bias_data = bias.mutable_data(); + auto* mean_data = mean.mutable_data(); + auto* variance_data = variance.mutable_data(); + auto* y_data = y.mutable_data(); + for (int i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i % 64); + } + for (int i = 0; i < scale.dims().production(); i++) { + scale_data[i] = static_cast(i) * 0.01f + 0.03f; + } + for (int i = 0; i < bias.dims().production(); i++) { + bias_data[i] = static_cast(i) * 0.065f + 0.1f; + } + for (int i = 0; i < mean.dims().production(); i++) { + mean_data[i] = static_cast(i) * 0.0565f; + } + for (int i = 0; i < variance.dims().production(); i++) { + variance_data[i] = static_cast(i) * 2.08f + 1.5f; + } + // prepare kernel params and run + BatchNormCompute batch_norm; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + batch_norm.SetContext(std::move(ctx)); + operators::BatchNormParam param; + param.x = &x; + param.scale = &scale; + param.bias = &bias; + param.mean = &mean; + param.variance = &variance; + param.is_test = is_test; + param.use_global_stats = use_global_stats; + param.epsilon = epsilon; + param.momentum = momentum; + param.data_layout = data_layout; + param.y = &y; + param.mean_out = &mean_out; + param.variance_out = &variance_out; + param.saved_mean = &saved_mean; + param.saved_variance = &saved_variance; + batch_norm.SetParam(param); + batch_norm.Launch(); + // invoking ref implementation and compare results + param.y = &y_ref; + param.mean_out = &mean_out_ref; + param.variance_out = &variance_out_ref; + param.saved_mean = &saved_mean_ref; + param.saved_variance = &saved_variance_ref; + batch_norm_compute_ref(param); + auto* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.dims().production(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } + } + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc index 4b95aa5ce4a3fd8bc1aa76c7ae3f66f13f60b4ea..e4d80265d7728fa0eeea97fd070a982a8888ec7e 100644 --- a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc @@ -124,6 +124,20 @@ TEST(conv_arm, init) { TEST(conv_arm, compute) { DeviceInfo::Init(); +#if 1 + for (auto n : {2}) { + for (auto ic : {6}) { + for (auto oc : {6}) { + for (auto ih : {9}) { + for (auto iw : {9}) { + for (auto flag_bias : {false, true}) { + for (auto flag_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1, 2}) { + for (auto ks : {1, 3, 5}) { +#else for (auto n : {1, 2}) { for (auto ic : {6, 32 /*, 128*/}) { for (auto oc : {6, 32 /*, 128*/}) { @@ -136,6 +150,7 @@ TEST(conv_arm, compute) { for (auto stride : {1, 2}) { for (auto padding : {0, 1, 2}) { for (auto ks : {1, 3, 5}) { +#endif int group = 1; if (depthwise) { // depthwise convolution ? 
group = oc = ic; diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.cc b/paddle/fluid/lite/kernels/arm/fc_compute.cc index b26551e0533a5ae68c930cc1b9512ba0ca13253a..e31c36d91dbb6cb38fd963510f779df754ec3434 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc @@ -22,7 +22,7 @@ namespace lite { namespace kernels { namespace arm { -void FcCompute::Run() { +void FcCompute::PrepareForRun() { auto& param = this->Param(); auto x_dims = param.input->dims(); auto w_dims = param.w->dims(); @@ -31,39 +31,56 @@ void FcCompute::Run() { CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(param.output->dims().size(), 2UL); + m_ = x_dims.Slice(0, param.in_num_col_dims).production(); + k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); + n_ = w_dims[1]; + CHECK_EQ(k_, static_cast(w_dims[0])); + + if (m_ == 1) { + if (!transed_weight_) { + transed_weight_ = new Tensor; + } + transed_weight_->Resize({n_, k_}); + const auto* w_data = param.w->data(); + auto* t_data = transed_weight_->mutable_data(); + int i = 0; + + for (int nn = 0; nn < n_; ++nn) { + for (int kk = 0; kk < k_; ++kk) { + t_data[i++] = w_data[kk * n_ + nn]; + } + } + } +} + +void FcCompute::Run() { + auto& param = this->Param(); + const auto* i_data = param.input->data(); const auto* w_data = param.w->data(); const auto* b_data = param.bias ? param.bias->data() : nullptr; auto* o_data = param.output->mutable_data(); - int x_h = x_dims.Slice(0, param.in_num_col_dims).production(); - int x_w = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production(); - int n = w_dims[1]; - CHECK_EQ(x_w, static_cast(w_dims[0])); auto& ctx = this->ctx_->template As(); - if (x_h > 1) { + if (m_ > 1) { float* packed_in = static_cast(ctx.workspace_data()) + ctx.l2_cache_size() / sizeof(float); - lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false, - &ctx); - lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n, - x_w, false, false, false, &ctx); - + lite::arm::math::prepackA(packed_in, i_data, k_, 0, m_, 0, k_, false, &ctx); + lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, m_, n_, + k_, false, false, false, &ctx); if (param.bias) { - CHECK_EQ(param.bias->numel(), n); - lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n); + CHECK_EQ(param.bias->numel(), n_); + lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_); } } else { - // use sgemmv - // sgemv((const float*)weights, (const float*)din, (float*)dout, - // false, n, x_w, _param->_flag_bias, (float*)bias, false); + CHECK(transed_weight_); + const auto* t_data = transed_weight_->data(); + + lite::arm::math::sgemv(t_data, i_data, o_data, false, n_, k_, + b_data != nullptr, b_data, false); } } -TargetType FcCompute::target() const { return TARGET(kARM); } - -PrecisionType FcCompute::precision() const { return PRECISION(kFloat); } - } // namespace arm } // namespace kernels } // namespace lite diff --git a/paddle/fluid/lite/kernels/arm/fc_compute.h b/paddle/fluid/lite/kernels/arm/fc_compute.h index 414517843354f638ed37f54ef596dc6db53193ce..37f90b31f8a186e1108549d28c465b816c436b0f 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute.h +++ b/paddle/fluid/lite/kernels/arm/fc_compute.h @@ -25,12 +25,19 @@ class FcCompute : public KernelLite { public: using param_t = operators::FcParam; + void PrepareForRun() override; + void Run() override; - TargetType target() const override; - PrecisionType precision() const override; + ~FcCompute() override { + if (transed_weight_) { + delete 
transed_weight_; + } + }; - virtual ~FcCompute() = default; + private: + lite::Tensor* transed_weight_{nullptr}; + int m_, n_, k_; }; } // namespace arm diff --git a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc index 2e85fccf7d66be1cbb596bf762d8d1c0f9d608bd..8eec578c27188fd175ff3562082df144458c85fa 100644 --- a/paddle/fluid/lite/kernels/arm/fc_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/fc_compute_test.cc @@ -14,6 +14,11 @@ #include "paddle/fluid/lite/kernels/arm/fc_compute.h" #include +#include +#include +#include +#include +#include #include #include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" @@ -23,6 +28,17 @@ namespace lite { namespace kernels { namespace arm { +template +void FillData(T* a, const int n, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + TEST(fc_arm, retrive_op) { auto fc = KernelRegistry::Global().Create("fc"); @@ -37,108 +53,117 @@ TEST(fc_arm, init) { } TEST(fc_arm, compare_test) { - lite::Tensor x, w, b, out, ref; - constexpr int batch_size = 2; - x.Resize({batch_size, 3}); - w.Resize({3, 4}); - b.Resize({1, 4}); - out.Resize({batch_size, 4}); - ref.Resize({batch_size, 4}); - - auto x_data = x.mutable_data(); - auto w_data = w.mutable_data(); - auto b_data = b.mutable_data(); - auto out_data = out.mutable_data(); - auto ref_data = ref.mutable_data(); - - for (int64_t i = 0; i < x.dims().product(); i++) { - x_data[i] = static_cast(i); - } - for (int64_t i = 0; i < w.dims().product(); i++) { - w_data[i] = static_cast(i); - } - for (int64_t i = 0; i < b.dims().product(); i++) { - b_data[i] = static_cast(i); - } - - lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, // - w_data, 3, 4, // - b_data, ref_data); - - // fc compute kernel - FcCompute fc; - operators::FcParam param; - - param.in_num_col_dims = 1; - param.input = &x; - param.w = &w; - param.bias = &b; - param.output = &out; - param.in_mat_dims = x.dims(); - - DeviceInfo::Init(); - std::unique_ptr ctx(new KernelContext); - ctx->As(); - fc.SetParam(param); - fc.SetContext(std::move(ctx)); - fc.Run(); - - VLOG(3) << "output vs ref"; - for (int i = 0; i < out.dims().product(); i++) { - VLOG(3) << out_data[i] << " vs " << ref_data[i]; - } - - for (int i = 0; i < out.dims().product(); ++i) { - EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + using T = float; + + for (int m : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + for (int k : {1, 2, 3, 4}) { + for (bool with_bias : {true, false}) { + VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k + << (with_bias ? ", with bias" : ""); + lite::Tensor x, w, b, out, ref; + + x.Resize({m, k}); + w.Resize({k, n}); + b.Resize({1, n}); + out.Resize({m, n}); + ref.Resize({m, n}); + + auto* x_data = x.mutable_data(); + auto* w_data = w.mutable_data(); + auto* b_data = with_bias ? 
b.mutable_data() : nullptr; + + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(w_data, w.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + + if (with_bias) { + FillData(b_data, b.dims().production()); + } + + FcCompute fc; + operators::FcParam param; + + param.input = &x; + param.w = &w; + param.bias = with_bias ? &b : nullptr; + param.output = &out; + param.in_num_col_dims = 1; + param.in_mat_dims = x.dims(); + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.PrepareForRun(); + fc.Run(); + + lite::arm::math::fc_compute_eigen(x_data, m, k, w_data, k, n, b_data, + ref_data); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } + } + } + } } } TEST(fc_arm, num_col_dims) { - FcCompute fc; - operators::FcParam param; - - lite::Tensor x; - lite::Tensor w; - lite::Tensor bias; - lite::Tensor output; - - x.Resize({1, 2, 3}); - w.Resize({3, 4}); - bias.Resize({1, 4}); - output.Resize({2, 4}); - - auto* x_data = x.mutable_data(); - auto* w_data = w.mutable_data(); - auto* bias_data = bias.mutable_data(); - auto* output_data = output.mutable_data(); - - for (int64_t i = 0; i < x.dims().product(); i++) { - x_data[i] = static_cast(i); - } - for (int64_t i = 0; i < w.dims().product(); i++) { - w_data[i] = static_cast(i); + using T = float; + + for (bool with_bias : {true, false}) { + lite::Tensor x, w, b, out, ref; + + x.Resize({1, 2, 3}); + w.Resize({3, 4}); + b.Resize({1, 4}); + out.Resize({2, 4}); + ref.Resize({2, 4}); + + auto* x_data = x.mutable_data(); + auto* w_data = w.mutable_data(); + auto* b_data = with_bias ? b.mutable_data() : nullptr; + + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(w_data, w.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + if (with_bias) { + FillData(b_data, b.dims().production()); + } + FcCompute fc; + operators::FcParam param; + param.input = &x; + param.w = &w; + param.bias = with_bias ? 
&b : nullptr; + param.output = &out; + param.in_num_col_dims = 2; + param.in_mat_dims = x.dims(); + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + DeviceInfo::Init(); + + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.PrepareForRun(); + fc.Run(); + + lite::arm::math::fc_compute_eigen(x_data, 2, 3, w_data, 3, 4, b_data, + ref_data); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } } - for (int64_t i = 0; i < bias.dims().product(); i++) { - bias_data[i] = static_cast(i); - } - for (int64_t i = 0; i < output.dims().product(); i++) { - output_data[i] = static_cast(i); - } - - param.in_num_col_dims = 2; - param.input = &x; - param.w = &w; - param.bias = &bias; - param.output = &output; - param.in_mat_dims = x.dims(); - - std::unique_ptr ctx(new KernelContext); - ctx->As(); - DeviceInfo::Init(); - - fc.SetParam(param); - fc.SetContext(std::move(ctx)); - fc.Run(); } } // namespace arm diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.cc b/paddle/fluid/lite/kernels/arm/mul_compute.cc index ff12b236031896cfd8503903327ab1141b5171ae..269e4842252c2a88f33c8faf6666d139e36e49f3 100644 --- a/paddle/fluid/lite/kernels/arm/mul_compute.cc +++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc @@ -12,57 +12,57 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include "paddle/fluid/lite/core/kernel.h" +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/core/op_registry.h" -#include "paddle/fluid/lite/core/types.h" +#include "paddle/fluid/lite/core/type_system.h" namespace paddle { namespace lite { namespace kernels { namespace arm { -template -void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h, - int y_w, T* out) { - using matrix_t = - Eigen::Matrix; +void MulCompute::PrepareForRun() { + // TODO(TJ): transpose x or y if necessary +} - Eigen::Map X(x, x_h, x_w); - Eigen::Map Y(y, y_h, y_w); - Eigen::Map Out(out, x_h, y_w); +void MulCompute::Run() { + auto& param = Param(); - Out = X * Y; -} + const auto* x_data = param.x->data(); + const auto* y_data = param.y->data(); + auto* o_data = param.output->mutable_data(); -class MulCompute : public KernelLite { - public: - using param_t = operators::MulParam; + int m = static_cast( + param.x->dims().Slice(0, param.x_num_col_dims).production()); + int x_w = + static_cast(param.x->dims() + .Slice(param.x_num_col_dims, param.x->dims().size()) + .production()); + int y_h = static_cast( + param.y->dims().Slice(0, param.y_num_col_dims).production()); + int n = + static_cast(param.y->dims() + .Slice(param.y_num_col_dims, param.y->dims().size()) + .production()); - void Run() override { - auto& param = Param(); - core::dim2 x_shape( - {static_cast( - param.x->dims().Slice(0, param.x_num_col_dims).production()), - static_cast( - param.x->dims() - .Slice(param.x_num_col_dims, param.x->dims().size()) - .production())}); - core::dim2 y_shape( - {static_cast( - param.y->dims().Slice(0, param.y_num_col_dims).production()), - static_cast( - param.y->dims() - .Slice(param.y_num_col_dims, param.y->dims().size()) - .production())}); + CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; + auto k = x_w; + if (n == 1) { + lite::arm::math::sgemv(x_data, y_data, o_data, false, m, k, false, nullptr, + false); - mul_compute_eigen(param.x->data(), x_shape.x, x_shape.y, // - param.y->data(), y_shape.x, y_shape.y, // - param.output->mutable_data()); - 
}
+  } else {
+    constexpr bool is_tranposed_y = false;
+    auto& ctx = this->ctx_->template As<ARMContext>();
-  virtual ~MulCompute() = default;
-};
+    float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
+                      ctx.l2_cache_size() / sizeof(float);
+    lite::arm::math::prepackA(packed_x, x_data, k, 0, m, 0, k, false, &ctx);
+    lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m, n, k,
+                                   false, false, is_tranposed_y, &ctx);
+  }
+}
 
 }  // namespace arm
 }  // namespace kernels
diff --git a/paddle/fluid/lite/kernels/arm/mul_compute.h b/paddle/fluid/lite/kernels/arm/mul_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..c18995e5a5c3cceb749465382b284c0a52c188a4
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.h
@@ -0,0 +1,39 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+#include "paddle/fluid/lite/core/types.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MulParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~MulCompute() = default;
+};
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/kernels/arm/mul_compute_test.cc b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5e3d17ec93ae9d73028343b3d4dd1e77a0fe86f0
--- /dev/null
+++ b/paddle/fluid/lite/kernels/arm/mul_compute_test.cc
@@ -0,0 +1,152 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
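Both the reworked FcCompute earlier in this patch and the MulCompute above flatten their operands into an (m x k) by (k x n) product via the *_num_col_dims attributes, then branch on the degenerate dimension: a plain matrix-vector call when one side collapses to 1, otherwise pack the left matrix and run the blocked GEMM. A standalone sketch of that shape derivation and dispatch, with naive loops standing in for sgemv/sgemm_prepack (function names here are illustrative only):

    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Collapse dims [from, to) into one extent -- the same flattening the
    // kernels perform with DDim::Slice(...).production().
    int64_t flatten(const std::vector<int64_t>& dims, int from, int to) {
      return std::accumulate(dims.begin() + from, dims.begin() + to,
                             int64_t{1}, std::multiplies<int64_t>());
    }

    // y[m x n] = x[m x k] * w[k x n]; the dispatch mirrors the kernels
    // above (mul branches on n == 1, fc on m == 1).
    void matmul_dispatch(const float* x, const float* w, float* y,
                         int64_t m, int64_t k, int64_t n) {
      if (n == 1) {
        // gemv path: one dot product per output row.
        for (int64_t i = 0; i < m; ++i) {
          float acc = 0.f;
          for (int64_t j = 0; j < k; ++j) acc += x[i * k + j] * w[j];
          y[i] = acc;
        }
      } else {
        // gemm path: the real kernel prepacks the lhs for cache reuse.
        for (int64_t i = 0; i < m; ++i)
          for (int64_t p = 0; p < n; ++p) {
            float acc = 0.f;
            for (int64_t j = 0; j < k; ++j) acc += x[i * k + j] * w[j * n + p];
            y[i * n + p] = acc;
          }
      }
    }

For example, x dims {2, 3, 4} with x_num_col_dims = 1 flatten to m = 2 and k = 12, which is exactly the shape the mul num_col_dims test below exercises.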
+ +#include "paddle/fluid/lite/kernels/arm/mul_compute.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/lite/arm/math/funcs.h" +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +template +void FillData(T* a, const int n, const T lower = static_cast(-2.f), + const T upper = static_cast(2.f)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + for (int i = 0; i < n; ++i) { + a[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +TEST(mul_arm, retrive_op) { + auto mul = + KernelRegistry::Global().Create("mul"); + ASSERT_FALSE(mul.empty()); + ASSERT_TRUE(mul.front()); +} + +TEST(mul_arm, init) { + MulCompute mul; + ASSERT_EQ(mul.precision(), PRECISION(kFloat)); + ASSERT_EQ(mul.target(), TARGET(kARM)); +} + +TEST(mul_arm, compare_test) { + using T = float; + + for (int m : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + for (int k : {1, 2, 3, 4}) { + VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k; + lite::Tensor x, y, out, ref; + x.Resize({m, k}); + y.Resize({k, n}); + out.Resize({m, n}); + ref.Resize({m, n}); + + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(y_data, y.dims().production()); + FillData(out_data, out.dims().production(), 0, 0); + FillData(ref_data, ref.dims().production(), 0, 0); + + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetParam(param); + mul.SetContext(std::move(ctx)); + mul.PrepareForRun(); + + mul.Run(); + + lite::arm::math::mul_compute_eigen(x_data, m, k, y_data, k, n, + ref_data); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } + } + } + } +} + +TEST(mul_arm, num_col_dims) { + using T = float; + + lite::Tensor x, y, out, ref; + x.Resize({2, 3, 4}); + y.Resize({3, 4, 5}); + out.Resize({2, 5}); + ref.Resize({2, 5}); + + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* out_data = out.mutable_data(); + auto* ref_data = ref.mutable_data(); + + FillData(x_data, x.dims().production()); + FillData(y_data, y.dims().production()); + FillData(out_data, out.dims().production()); + FillData(ref_data, out.dims().production()); + + MulCompute mul; + operators::MulParam param; + + param.x = &x; + param.y = &y; + param.output = &out; + param.x_num_col_dims = 1; + param.y_num_col_dims = 2; + + DeviceInfo::Init(); + std::unique_ptr ctx(new KernelContext); + ctx->As(); + mul.SetParam(param); + mul.SetContext(std::move(ctx)); + mul.PrepareForRun(); + + mul.Run(); + + lite::arm::math::mul_compute_eigen(x_data, 2, 12, y_data, 12, 5, ref_data); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-3); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); diff --git a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc index 35873a9d2cc3fa922f48cc87e8e2c4191ac8ee60..b024ccef9d526d56bcf52c1600940ff0804eaf1f 100644 --- a/paddle/fluid/lite/kernels/arm/pool_compute_test.cc +++ 
b/paddle/fluid/lite/kernels/arm/pool_compute_test.cc @@ -182,7 +182,7 @@ TEST(pool_arm, compute) { for (auto stride : {2}) { for (auto pad : {0}) { for (auto n : {1, 3, 4, 11}) { - for (auto c : {1, 3, 11, 4, 1024}) { + for (auto c : {1, 3, 11 /* ,1024 */}) { // speedup for ci for (auto h : {3, 1, 11, 4, 1}) { for (auto w : {1, 3, 4, 12, 1}) { VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w diff --git a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc index fee47d7eb7a6c093524bb0af617c60d069add01a..b1277792286429b666b3479c0655bb211a69db30 100644 --- a/paddle/fluid/lite/kernels/arm/scale_compute_test.cc +++ b/paddle/fluid/lite/kernels/arm/scale_compute_test.cc @@ -54,6 +54,15 @@ TEST(scale_arm, compute) { lite::Tensor output; lite::Tensor output_ref; +#if 1 // for ci speedup + for (auto n : {1, 3}) { + for (auto c : {1, 3}) { + for (auto h : {3, 4}) { + for (auto w : {4, 3}) { + for (auto bias_after_scale : {true, false}) { + for (auto s : {-1.0f, 0.13f}) { + for (auto b : {-15.f, 0.11234f}) { +#else for (auto n : {1, 3, 4, 11}) { for (auto c : {1, 3, 11, 4}) { for (auto h : {3, 1, 11, 4}) { @@ -61,6 +70,8 @@ TEST(scale_arm, compute) { for (auto bias_after_scale : {true, false}) { for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) { for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) { +#endif + x.Resize(DDim(std::vector({n, c, h, w}))); output.Resize(DDim(std::vector({n, c, h, w}))); output_ref.Resize(DDim(std::vector({n, c, h, w}))); diff --git a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt index f35f634a217fabd539c9b124c44bc6cdeb186dd6..b7a48946257cb03e311949dd0aa51e31ad239eca 100644 --- a/paddle/fluid/lite/kernels/cuda/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/cuda/CMakeLists.txt @@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) + diff --git a/paddle/fluid/lite/kernels/host/CMakeLists.txt b/paddle/fluid/lite/kernels/host/CMakeLists.txt index a71a8e13ab8fe1667dc7d0dc8477d58182d5139f..7e8e6bcb6db82c570885b32aeed8542ed10209a5 100644 --- a/paddle/fluid/lite/kernels/host/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt @@ -13,3 +13,4 @@ set(host_kernels ) set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels") + diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt index 3747351d5626b9cb5e0e5afda6b01e6d7a464ad5..c2845fb9b21b2e4d0bb7ff378676d4531212db52 100644 --- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt @@ -35,3 +35,4 @@ set(x86_kernels ) set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") + diff --git a/paddle/fluid/lite/kernels/x86/sgd_compute.cc b/paddle/fluid/lite/kernels/x86/sgd_compute.cc index 27261fd14d643f82c8847887c6788f4bb6820439..2b50c9172a0bcbea5c4fb269c295ee37ee99ab05 100644 --- a/paddle/fluid/lite/kernels/x86/sgd_compute.cc +++ b/paddle/fluid/lite/kernels/x86/sgd_compute.cc @@ -29,9 +29,9 @@ class SGDCompute : public KernelLite { using param_t = operators::ActivationParam; void Run() override { - auto &context = context_->As(); + auto &context = ctx_->As(); auto &sgd_param = *param_.get_mutable(); - CHECK(context.x86_device_context); + CHECK(context.x86_device_context()); // param.Out->template mutable_data(); @@ -45,12 +45,12 @@ class SGDCompute : public KernelLite { 
PADDLE_ENFORCE_EQ(grad->numel(), sz); paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1); - const T *lr = learning_rate->data(); - const T *param_data = param->data(); - const T *grad_data = grad->data(); + const T *lr = learning_rate->template data(); + const T *param_data = param->template data(); + const T *grad_data = grad->template data(); int64_t rows_idx = 0; - T *out_data = - param_out->mutable_data(context.x86_device_context->GetPlace()); + T *out_data = param_out->template mutable_data( + context.x86_device_context()->GetPlace()); auto sgd = paddle::operators::jit::KernelFuncs, diff --git a/paddle/fluid/lite/model_parser/CMakeLists.txt b/paddle/fluid/lite/model_parser/CMakeLists.txt index 63fe21abdafb916be72fddb99023d6ba4b8530c0..d179e0350ac0edd89912377cc668c6b8888c2638 100644 --- a/paddle/fluid/lite/model_parser/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/CMakeLists.txt @@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des add_subdirectory(pb) add_subdirectory(cpp) + diff --git a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt index 71073179991294aadef40d5df6d23662ec41fcfe..e6e2fc77f00c691176aa5c20c455964bd9bd5e66 100644 --- a/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/cpp/CMakeLists.txt @@ -1 +1,2 @@ cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite) + diff --git a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt index 22d88aeabf479e9c234cfa1e9660a6d2af9439b4..6910542f2a17f1ec5cdbe5f77203197ae3d57b89 100644 --- a/paddle/fluid/lite/model_parser/pb/CMakeLists.txt +++ b/paddle/fluid/lite/model_parser/pb/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite) cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite) + diff --git a/paddle/fluid/lite/operators/CMakeLists.txt b/paddle/fluid/lite/operators/CMakeLists.txt index 9a90666420e94bdb585feeac689d9227fc6a2104..ac3dc1285e4ef8f7b6caba63884ec9966957613a 100644 --- a/paddle/fluid/lite/operators/CMakeLists.txt +++ b/paddle/fluid/lite/operators/CMakeLists.txt @@ -8,6 +8,7 @@ cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS}) cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS}) cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} ) +cc_library(batch_norm_op_lite SRCS batch_norm_op.cc DEPS ${op_DEPS}) cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS}) cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) @@ -30,6 +31,7 @@ set(ops_lite scale_op_lite softmax_op_lite reshape_op_lite + batch_norm_op_lite feed_op_lite fetch_op_lite io_copy_op_lite @@ -52,4 +54,6 @@ lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite) lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite) lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) +lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) + diff --git a/paddle/fluid/lite/operators/batch_norm_op.cc b/paddle/fluid/lite/operators/batch_norm_op.cc new file mode 100644 index 
0000000000000000000000000000000000000000..e974d0134dad93a2241c265687a190b10d5ff85d
--- /dev/null
+++ b/paddle/fluid/lite/operators/batch_norm_op.cc
@@ -0,0 +1,110 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/lite/operators/batch_norm_op.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool BatchNormOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.x);
+  CHECK_OR_FALSE(param_.bias);
+  CHECK_OR_FALSE(param_.scale);
+  CHECK_OR_FALSE(param_.mean);
+  CHECK_OR_FALSE(param_.variance);
+  CHECK_OR_FALSE(param_.y);
+  if (!param_.is_test) {
+    CHECK_OR_FALSE(param_.mean_out);
+    CHECK_OR_FALSE(param_.variance_out);
+    CHECK_OR_FALSE(param_.saved_mean);
+    CHECK_OR_FALSE(param_.saved_variance);
+  }
+  auto x_dims = param_.x->dims();
+  auto scale_dims = param_.scale->dims();
+  auto bias_dims = param_.bias->dims();
+  auto mean_dims = param_.mean->dims();
+  auto variance_dims = param_.variance->dims();
+  CHECK(x_dims.size() >= 2 && x_dims.size() <= 5)
+      << "Input X must have 2 to 5 dimensions.";
+  CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimension.";
+  CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimension.";
+  CHECK_EQ(mean_dims.size(), 1UL) << "Input Mean must have 1 dimension.";
+  CHECK_EQ(variance_dims.size(), 1UL)
+      << "Input Variance must have 1 dimension.";
+  return true;
+}
+
+bool BatchNormOp::InferShape() const {
+  auto x_dims = param_.x->dims();
+  int64_t channel_size = 0;
+  switch (param_.data_layout) {
+    case DATALAYOUT(kNCHW):
+      channel_size = x_dims[1];
+      break;
+    // case DATALAYOUT(kNHWC):
+    //   channel_size = x_dims[x_dims.size() - 1];
+    //   break;
+    default:
+      LOG(FATAL) << "Unknown storage order: "
+                 << DataLayoutToStr(param_.data_layout);
+      break;
+  }
+  if (!param_.is_test) {
+    param_.mean_out->Resize({channel_size});
+    param_.variance_out->Resize({channel_size});
+    param_.saved_mean->Resize({channel_size});
+    param_.saved_variance->Resize({channel_size});
+  }
+  param_.y->Resize(x_dims);
+  return true;
+}
+
+bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
+  param_.x =
+      scope->FindVar(op_desc.Input("X").front())->GetMutable<lite::Tensor>();
+  param_.bias = scope->FindVar(op_desc.Input("Bias").front())
+                    ->GetMutable<lite::Tensor>();
+  param_.scale = scope->FindVar(op_desc.Input("Scale").front())
+                     ->GetMutable<lite::Tensor>();
+  param_.mean = scope->FindVar(op_desc.Input("Mean").front())
+                    ->GetMutable<lite::Tensor>();
+  param_.variance = scope->FindVar(op_desc.Input("Variance").front())
+                        ->GetMutable<lite::Tensor>();
+  param_.y =
+      scope->FindVar(op_desc.Output("Y").front())->GetMutable<lite::Tensor>();
+  param_.is_test = op_desc.GetAttr<bool>("is_test");
+  param_.use_global_stats = op_desc.GetAttr<bool>("use_global_stats");
+  if (!param_.is_test) {
+    param_.mean_out = scope->FindVar(op_desc.Output("MeanOut").front())
+                          ->GetMutable<lite::Tensor>();
+    param_.variance_out = scope->FindVar(op_desc.Output("VarianceOut").front())
+                              ->GetMutable<lite::Tensor>();
+    param_.saved_mean = scope->FindVar(op_desc.Output("SavedMean").front())
+                            ->GetMutable<lite::Tensor>();
+    param_.saved_variance =
+        scope->FindVar(op_desc.Output("SavedVariance").front())
+            ->GetMutable<lite::Tensor>();
+  }
+  param_.epsilon = op_desc.GetAttr<float>("epsilon");
+  param_.momentum = op_desc.GetAttr<float>("momentum");
+  std::string data_layout = op_desc.GetAttr<std::string>("data_layout");
+  CHECK_EQ(data_layout, "NCHW") << "TODO(hong19860320): Only support NCHW.";
+  // param_.data_layout = StringToDataLayout(data_layout);
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOp);
diff --git a/paddle/fluid/lite/operators/batch_norm_op.h b/paddle/fluid/lite/operators/batch_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..30e8747319b1575b0c63e4b2919ed1363ad10bef
--- /dev/null
+++ b/paddle/fluid/lite/operators/batch_norm_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/core/op_lite.h"
+#include "paddle/fluid/lite/core/scope.h"
+#include "paddle/fluid/lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class BatchNormOp : public OpLite {
+ public:
+  BatchNormOp() {}
+  explicit BatchNormOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "batch_norm"; }
+
+ private:
+  mutable BatchNormParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/paddle/fluid/lite/operators/batch_norm_op_test.cc b/paddle/fluid/lite/operators/batch_norm_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b91c367d92b721c1f96fd5fc92ec0b4f877408e4
--- /dev/null
+++ b/paddle/fluid/lite/operators/batch_norm_op_test.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/lite/operators/batch_norm_op.h" +#include +#include "paddle/fluid/lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +TEST(batch_norm_op_lite, test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* scale = scope.Var("scale")->GetMutable(); + auto* bias = scope.Var("bias")->GetMutable(); + auto* mean = scope.Var("mean")->GetMutable(); + auto* variance = scope.Var("variance")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + x->Resize({2, 32, 10, 20}); + auto x_dims = x->dims(); + const int64_t channel_size = x_dims[1]; // NCHW + scale->Resize({channel_size}); + bias->Resize({channel_size}); + mean->Resize({channel_size}); + variance->Resize({channel_size}); + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("batch_norm"); + desc.SetInput("X", {"x"}); + desc.SetInput("Scale", {"scale"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Mean", {"mean"}); + desc.SetInput("Variance", {"variance"}); + desc.SetOutput("Y", {"y"}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_global_stats", false); + desc.SetAttr("epsilon", 1e-5f); + desc.SetAttr("momentum", 0.9f); + desc.SetAttr("data_layout", std::string("NCHW")); + + BatchNormOp batch_norm("batch_norm"); + + batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}}); + batch_norm.Attach(desc, &scope); + batch_norm.CheckShape(); + batch_norm.InferShape(); + + // check output dims + auto y_dims = y->dims(); + CHECK_EQ(y_dims.size(), x_dims.size()); + for (size_t i = 0; i < y_dims.size(); i++) { + CHECK_EQ(y_dims[i], x_dims[i]); + } +} + +TEST(batch_norm_op_lite, test_enable_is_test) { + // prepare variables + Scope scope; + auto* x = scope.Var("x")->GetMutable(); + auto* scale = scope.Var("scale")->GetMutable(); + auto* bias = scope.Var("bias")->GetMutable(); + auto* mean = scope.Var("mean")->GetMutable(); + auto* variance = scope.Var("variance")->GetMutable(); + auto* y = scope.Var("y")->GetMutable(); + auto* mean_out = scope.Var("mean_out")->GetMutable(); + auto* variance_out = scope.Var("variance_out")->GetMutable(); + auto* saved_mean = scope.Var("saved_mean")->GetMutable(); + auto* saved_variance = scope.Var("saved_variance")->GetMutable(); + x->Resize({2, 32, 10, 20}); + auto x_dims = x->dims(); + const int64_t channel_size = x_dims[1]; // NCHW + scale->Resize({channel_size}); + bias->Resize({channel_size}); + mean->Resize({channel_size}); + variance->Resize({channel_size}); + + // prepare op desc + cpp::OpDesc desc; + desc.SetType("batch_norm"); + desc.SetInput("X", {"x"}); + desc.SetInput("Scale", {"scale"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Mean", {"mean"}); + desc.SetInput("Variance", {"variance"}); + desc.SetOutput("Y", {"y"}); + desc.SetOutput("MeanOut", {"mean_out"}); + desc.SetOutput("VarianceOut", {"variance_out"}); + desc.SetOutput("SavedMean", {"saved_mean"}); + desc.SetOutput("SavedVariance", {"saved_variance"}); + desc.SetAttr("is_test", false); + desc.SetAttr("use_global_stats", false); + desc.SetAttr("epsilon", 1e-5f); + desc.SetAttr("momentum", 0.9f); + desc.SetAttr("data_layout", std::string("NCHW")); + + BatchNormOp batch_norm("batch_norm"); + + batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}}); + batch_norm.Attach(desc, &scope); + batch_norm.CheckShape(); + batch_norm.InferShape(); + + // check output dims + auto y_dims = y->dims(); + CHECK_EQ(y_dims.size(), x_dims.size()); + for (size_t i = 0; i < y_dims.size(); i++) { + CHECK_EQ(y_dims[i], 
x_dims[i]); + } + auto mean_out_dims = mean_out->dims(); + auto variance_out_dims = variance_out->dims(); + auto saved_mean_dims = saved_mean->dims(); + auto saved_variance_dims = saved_variance->dims(); + CHECK_EQ(mean_out_dims.size(), 1UL); + CHECK_EQ(variance_out_dims.size(), 1UL); + CHECK_EQ(saved_mean_dims.size(), 1UL); + CHECK_EQ(saved_variance_dims.size(), 1UL); + CHECK_EQ(mean_out_dims[0], channel_size); + CHECK_EQ(variance_out_dims[0], channel_size); + CHECK_EQ(saved_mean_dims[0], channel_size); + CHECK_EQ(saved_variance_dims[0], channel_size); +} + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/operators/op_params.h b/paddle/fluid/lite/operators/op_params.h index 9986d199393d262ddf6aa9e54f6553fbb5e7627a..91a6067959854f608e31a6151a4e63e26df7eb64 100644 --- a/paddle/fluid/lite/operators/op_params.h +++ b/paddle/fluid/lite/operators/op_params.h @@ -57,6 +57,7 @@ struct FcParam { lite::Tensor* output{}; lite::DDim in_mat_dims; int in_num_col_dims{1}; + bool weight_transposed{false}; }; struct ReluParam { @@ -145,6 +146,25 @@ struct ConvParam { std::string data_format{"Anylayout"}; }; +// For BatchNorm op +struct BatchNormParam { + lite::Tensor* x{}; + lite::Tensor* bias{}; + lite::Tensor* scale{}; + lite::Tensor* mean{}; + lite::Tensor* variance{}; + lite::Tensor* y{}; + lite::Tensor* mean_out{}; + lite::Tensor* variance_out{}; + lite::Tensor* saved_mean{}; + lite::Tensor* saved_variance{}; + bool is_test{true}; + bool use_global_stats{false}; + float epsilon; + float momentum; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; +}; + // For Pooling op struct PoolParam { lite::Tensor* x{}; diff --git a/paddle/fluid/lite/operators/pool_op_test.cc b/paddle/fluid/lite/operators/pool_op_test.cc index bf46a2ecbd8a465fa5a52bc099389ff3838a5840..9ab2865f1d04f2ca173b9d2f5f7d9e457f6754e8 100644 --- a/paddle/fluid/lite/operators/pool_op_test.cc +++ b/paddle/fluid/lite/operators/pool_op_test.cc @@ -74,7 +74,11 @@ TEST(pool_op_lite, test) { pool.Attach(desc, &scope); auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}}); LOG(INFO) << "kernels.size(): " << kernels.size(); +#ifdef LITE_WITH_ARM ASSERT_FALSE(kernels.empty()); +#else + ASSERT_TRUE(kernels.empty()); +#endif } } // namespace operators diff --git a/paddle/fluid/lite/operators/split_op.cc b/paddle/fluid/lite/operators/split_op.cc index 9b4b7662ab7ba7228ee215bf051601150e2b6bb7..0d5075b0971e4bd98de8aac9810bbe7514c1a562 100644 --- a/paddle/fluid/lite/operators/split_op.cc +++ b/paddle/fluid/lite/operators/split_op.cc @@ -37,7 +37,7 @@ bool SplitOp::InferShape() const { const auto §ions = param_.sections; const int outs_number = outs.size(); - std::vector outs_dims; + std::vector outs_dims; outs_dims.reserve(outs_number); if (num > 0) { diff --git a/paddle/fluid/lite/tools/Dockerfile.mobile b/paddle/fluid/lite/tools/Dockerfile.mobile index fa3a57c3e28cfb62bf85b5af8b9b9076b0affa67..6bba15b7b70594262941f8df7a088840d2cab065 100644 --- a/paddle/fluid/lite/tools/Dockerfile.mobile +++ b/paddle/fluid/lite/tools/Dockerfile.mobile @@ -88,4 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit RUN apt-get autoremove -y && apt-get clean RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz - + \ No newline at end of file diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index 
c73f7bf952fcfd5d5581d29d3211df8cf39babc7..392e9b82bb5e66bc835f8a1c1edc21f8fc9c81d5 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -13,6 +13,11 @@ function prepare_for_codegen { mkdir -p ./paddle/fluid/lite/gen_code touch ./paddle/fluid/lite/gen_code/__generated_code__.cc } + +function check_need_ci { + git log -1 --oneline | grep "test=develop" || exit -1 +} + function cmake_x86 { prepare_for_codegen cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} @@ -28,6 +33,17 @@ function cmake_gpu { cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" } +function check_style { + export PATH=/usr/bin:$PATH + #pre-commit install + clang-format --version + + if ! pre-commit run -a ; then + git diff + exit 1 + fi +} + function cmake_arm { # $1: ARM_TARGET_OS in "android" , "armlinux" # $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf" @@ -43,10 +59,15 @@ function cmake_arm { -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 } +function build_single { + #make $1 -j$(expr $(nproc) - 2) + make $1 -j8 +} + function build { file=$1 for _test in $(cat $file); do - make $_test -j$(expr $(nproc) - 2) + build_single $_test done } @@ -58,44 +79,12 @@ function test_lite { for _test in $(cat $file); do # We move the build phase here to make the 'gen_code' test compiles after the # corresponding test is executed and the C++ code generates. - make $_test -j$(expr $(nproc) - 2) + #make $_test -j$(expr $(nproc) - 2) + make $_test -j8 ctest -R $_test -V done } -port_armv8=5554 -port_armv7=5556 - -# Run test on android -function test_lite_android { - local file=$1 - local adb_abi=$2 - local port= - if [[ ${adb_abi} == "armeabi-v7a" ]]; then - port=${port_armv7} - fi - - if [[ ${adb_abi} == "arm64-v8a" ]]; then - port=${port_armv8} - fi - if [[ "${port}x" == "x" ]]; then - echo "Port can not be empty" - exit 1 - fi - - echo "file: ${file}" - # push all to adb and test - adb_work_dir="/data/local/tmp" - skip_list="test_model_parser_lite" - for _test in $(cat $file); do - [[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test' - testpath=$(find ./paddle/fluid -name ${_test}) - adb -s emulator-${port} push ${testpath} ${adb_work_dir} - adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}" - adb -s emulator-${port} shell "./${adb_work_dir}/${_test}" - done -} - # Build the code and run lite server tests. This is executed in the CI system. function build_test_server { mkdir -p ./build @@ -108,8 +97,34 @@ function build_test_server { build $LIBS_FILE } -# Build the code and run lite server tests. This is executed in the CI system. +# test_arm_android +function test_arm_android { + test_name=$1 + port=$2 + if [[ "${test_name}x" == "x" ]]; then + echo "test_name can not be empty" + exit 1 + fi + if [[ "${port}x" == "x" ]]; then + echo "Port can not be empty" + exit 1 + fi + + echo "test name: ${test_name}" + adb_work_dir="/data/local/tmp" + skip_list="test_model_parser_lite" # add more with space + [[ $skip_list =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && continue || echo 'skip $test_name' + testpath=$(find ./paddle/fluid -name ${test_name}) + adb -s emulator-${port} push ${testpath} ${adb_work_dir} + adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" + adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}" +} + +# Build the code and run lite arm tests. This is executed in the CI system. 
function build_test_arm { + port_armv8=5554 + port_armv7=5556 + adb kill-server adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done # start android arm64-v8a armeabi-v7a emulators first @@ -122,6 +137,7 @@ function build_test_arm { for os in "android" "armlinux" ; do for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do + # TODO(TJ): enable compile on v7-hf on andorid and all v7 on armlinux if [[ ${abi} == "armeabi-v7a-hf" ]]; then echo "armeabi-v7a-hf is not supported on both android and armlinux" continue @@ -138,17 +154,30 @@ function build_test_arm { cmake_arm ${os} ${abi} build $TESTS_FILE + # armlinux need in another docker + # TODO(TJ): enable test with armlinux if [[ ${os} == "android" ]]; then adb_abi=${abi} if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then adb_abi="armeabi-v7a" fi if [[ ${adb_abi} == "armeabi-v7a" ]]; then - # skip v7 tests + # skip all armv7 tests + # TODO(TJ): enable test with armv7 continue fi - test_lite_android $TESTS_FILE ${adb_abi} - # armlinux need in another docker + local port= + if [[ ${adb_abi} == "armeabi-v7a" ]]; then + port=${port_armv7} + fi + + if [[ ${adb_abi} == "arm64-v8a" ]]; then + port=${port_armv8} + fi + echo "test file: ${TESTS_FILE}" + for _test in $(cat $TESTS_FILE); do + test_arm_android $_test $port + done fi cd - done @@ -164,12 +193,13 @@ function print_usage { echo "----------------------------------------" echo -e "cmake_x86: run cmake with X86 mode" echo -e "cmake_cuda: run cmake with CUDA mode" - echo -e "cmake_arm: run cmake with ARM mode" + echo -e "--arm_os= --arm_abi= cmake_arm: run cmake with ARM mode" echo echo -e "build: compile the tests" + echo -e "--test_name= build_single: compile single test" echo echo -e "test_server: run server tests" - echo -e "test_mobile: run mobile tests" + echo -e "--test_name= --adb_port_number= test_arm_android: run arm test" echo "----------------------------------------" echo } @@ -182,11 +212,31 @@ function main { TESTS_FILE="${i#*=}" shift ;; + --test_name=*) + TEST_NAME="${i#*=}" + shift + ;; + --arm_os=*) + ARM_OS="${i#*=}" + shift + ;; + --arm_abi=*) + ARM_ABI="${i#*=}" + shift + ;; + --arm_port=*) + ARM_PORT="${i#*=}" + shift + ;; build) build $TESTS_FILE build $LIBS_FILE shift ;; + build_single) + build_single $TEST_NAME + shift + ;; cmake_x86) cmake_x86 shift @@ -196,15 +246,15 @@ function main { shift ;; cmake_arm) - cmake_arm $2 $3 + cmake_arm $ARM_OS $ARM_ABI shift ;; test_server) test_lite $TESTS_FILE shift ;; - test_mobile) - test_lite $TESTS_FILE + test_arm_android) + test_arm_android $TEST_NAME $ARM_PORT shift ;; build_test_server) @@ -215,6 +265,14 @@ function main { build_test_arm shift ;; + check_style) + check_style + shift + ;; + check_need_ci) + check_need_ci + shift + ;; *) # unknown option print_usage @@ -224,7 +282,5 @@ function main { done } -print_usage - main $@ diff --git a/paddle/fluid/lite/utils/CMakeLists.txt b/paddle/fluid/lite/utils/CMakeLists.txt index 08eeaa54f8eacd359fa154762b6a1bff379686c5..f610b7aab5c25cec5d9b4fc18aecc65b3651332b 100644 --- a/paddle/fluid/lite/utils/CMakeLists.txt +++ b/paddle/fluid/lite/utils/CMakeLists.txt @@ -9,3 +9,4 @@ set(utils_DEPS glog) lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite) cc_library(any_lite SRCS any.cc) cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite) + diff --git a/paddle/fluid/lite/x86/CMakeLists.txt b/paddle/fluid/lite/x86/CMakeLists.txt index 0347593e38af4af7cf2dd421801524bcb4d6d052..515933e2588844f2795ca676269965db9a9770fd 
100644 --- a/paddle/fluid/lite/x86/CMakeLists.txt +++ b/paddle/fluid/lite/x86/CMakeLists.txt @@ -4,3 +4,4 @@ endif() cc_library(target_wrapper_x86 SRCS target_wrapper.cc) + diff --git a/python/paddle/proto/__init__.py b/python/paddle/proto/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..07406a841ec90a79fbe5d0aca7b19d19d85e008a --- /dev/null +++ b/python/paddle/proto/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig +from paddle.proto.ModelConfig_pb2 import ModelConfig
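A closing note on the fc_compute.cc rewrite earlier in this patch: sgemv consumes the weight with each output neuron's coefficients contiguous, i.e. in (n x k) order, while FC stores W as (k x n). PrepareForRun therefore materializes a transposed copy once when m == 1 instead of paying for the stride-n access on every Run. A hedged sketch of that one-time transpose, mirroring the loop in the diff (plain C++, illustrative names):

    #include <vector>

    // W is stored (k x n) row-major; the gemv path wants (n x k) so each
    // output neuron's weights are contiguous. Done once in PrepareForRun.
    std::vector<float> transpose_kn_to_nk(const std::vector<float>& w,
                                          int k, int n) {
      std::vector<float> t(w.size());
      int i = 0;
      for (int nn = 0; nn < n; ++nn)
        for (int kk = 0; kk < k; ++kk) t[i++] = w[kk * n + nn];
      return t;
    }

Caching the transpose trades a little memory for latency, which pays off because batch-1 inference calls Run repeatedly with the same weights.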