Commit ace19269, authored by: S superjomn

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into HEAD

before_script:
  - env

image: $SERVER_LITE_DOCKER_IMAGE

stages:
  - ci
  - build_server
  - build_mobile

check:prebuilt:
  tags:
    - lite
  stage: ci
  script:
    #- pip3 install pre-commit
    #- alias python=python3
    - rm -rf ~/.pip
    - pip install pre-commit
    - pre-commit install
    - ./paddle/fluid/lite/tools/build.sh check_style
    #- ./paddle/fluid/lite/tools/build.sh check_need_ci
  cache:
    key: check_style
    paths:
      - /root/.cache

build:server:
  tags:
    - lite
  image: $SERVER_LITE_DOCKER_IMAGE
  stage: build_server
  cache:
    key: server_thirdparty
    paths:
      - build/third_party
      - /root/.ccache
  script:
    - apt install ccache
    - export http_proxy=http://172.19.57.45:3128
    - export https_proxy=http://172.19.57.45:3128
    #- export http_proxy=http://agent.baidu.com:8118
    #- export https_proxy=http://agent.baidu.com:8118
    - mkdir -p build
    - cd build
    - ../paddle/fluid/lite/tools/build.sh cmake_x86
    - make extern_eigen3
    - make extern_boost
    - make framework_proto
    - make extern_warpctc
    - cd ..
    - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/build/third_party/install/mklml/lib
    - ./paddle/fluid/lite/tools/build.sh build_test_server
  dependencies:
    - check:prebuilt

build:mobile:
  tags:
    - lite
  stage: build_mobile
  image: $MOBILE_LITE_DOCKER_IMAGE
  cache:
    key: mobile_thirdparty
    paths:
      - $MOBILE_LITE_CACHE0
      - $MOBILE_LITE_CACHE1
      - /root/.ccache
  script:
    - apt install ccache
    - export http_proxy=http://172.19.57.45:3128
    - export https_proxy=http://172.19.57.45:3128
    - ./paddle/fluid/lite/tools/build.sh build_test_arm
  dependencies:
    - build:server
@@ -166,6 +166,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
  #include(external/zlib)    # download, build, install gtest
  include(external/protobuf) # download, build, install protobuf
  include(external/eigen)    # download eigen3
  include(ccache)            # set ccache for compilation
  include(generic)           # simplify cmake module
  include(configure)         # add paddle env configuration
......
@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
add_subdirectory(gen_code)
@@ -14,7 +14,7 @@ if(LITE_WITH_CUDA)
    set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
#cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})
message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}")
......
@@ -66,7 +66,7 @@ USE_LITE_OP(fetch);
USE_LITE_OP(io_copy);
USE_LITE_OP(con2d);
// USE_LITE_OP(batch_norm);
USE_LITE_OP(relu);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(pool2d);
......
add_subdirectory(math)
@@ -32,5 +32,7 @@ cc_library(math_arm SRCS
    conv_winograd_3x3.cc
    conv_winograd.cc
    split.cc
    DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
    # TODO(TJ): fix me do not deps proto
@@ -58,6 +58,111 @@ void scale<float>(const float* din, float* dout, int num, float scale,
  }
}

template <>
void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
                  int inner_dim, const float* scale_data,
                  const float* bias_data) {
  int cnt = inner_dim >> 4;
  int remain = inner_dim % 16;
  int size = inner_dim * scale_dim;
  for (int n = 0; n < outer_dim; n++) {
    const float* din_ptr_n = din + n * size;
    float* dout_ptr_n = dout + n * size;
#pragma omp parallel for
    for (int i = 0; i < scale_dim; i++) {
      const float* din_ptr = din_ptr_n + i * inner_dim;
      float* dout_ptr = dout_ptr_n + i * inner_dim;
      float scale = scale_data[i];
      float32x4_t vscale = vdupq_n_f32(scale);
      float bias = bias_data[i];
      float32x4_t vbias = vdupq_n_f32(bias);
      for (int j = 0; j < cnt; j++) {
        float32x4_t din0 = vld1q_f32(din_ptr);
        float32x4_t din1 = vld1q_f32(din_ptr + 4);
        float32x4_t din2 = vld1q_f32(din_ptr + 8);
        float32x4_t din3 = vld1q_f32(din_ptr + 12);
        float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
        float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
        float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
        float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
        din_ptr += 16;
        vst1q_f32(dout_ptr, vsum1);
        vst1q_f32(dout_ptr + 4, vsum2);
        vst1q_f32(dout_ptr + 8, vsum3);
        vst1q_f32(dout_ptr + 12, vsum4);
        dout_ptr += 16;
      }
      for (int j = 0; j < remain; j++) {
        *dout_ptr = *din_ptr * scale + bias;
        dout_ptr++;
        din_ptr++;
      }
    }
  }
}

template <>
void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
                  const float* scale_data, const float* bias_data) {
  int cnt = scale_dim >> 4;
  int remain = scale_dim % 16;
  for (int n = 0; n < outer_dim; n++) {
    const float* din_ptr_n = din + n * scale_dim;
    float* dout_ptr_n = dout + n * scale_dim;
#pragma omp parallel for
    for (int i = 0; i < cnt; i++) {
      int idx = i << 4;
      const float* din_ptr = din_ptr_n + idx;
      const float* scale_ptr = scale_data + idx;
      const float* bias_ptr = bias_data + idx;
      float* dout_ptr = dout_ptr_n + idx;
      float32x4_t din0 = vld1q_f32(din_ptr);
      float32x4_t vscale0 = vld1q_f32(scale_ptr);
      float32x4_t vbias0 = vld1q_f32(bias_ptr);
      float32x4_t din1 = vld1q_f32(din_ptr + 4);
      float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
      float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
      float32x4_t din2 = vld1q_f32(din_ptr + 8);
      float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
      float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
      float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
      float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
      float32x4_t din3 = vld1q_f32(din_ptr + 12);
      float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
      float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
      vst1q_f32(dout_ptr, vsum1);
      vst1q_f32(dout_ptr + 4, vsum2);
      float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
      float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
      vst1q_f32(dout_ptr + 8, vsum3);
      vst1q_f32(dout_ptr + 12, vsum4);
    }
    int idx = cnt << 4;
    const float* din_ptr = din_ptr_n + idx;
    float* dout_ptr = dout_ptr_n + idx;
    const float* scale_ptr = scale_data + idx;
    const float* bias_ptr = bias_data + idx;
    for (int j = 0; j < remain; j++) {
      *dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
      dout_ptr++;
      din_ptr++;
      scale_ptr++;
      bias_ptr++;
    }
  }
}

}  // namespace math
}  // namespace arm
}  // namespace lite
......
@@ -22,6 +22,14 @@ namespace math {
template <typename T>
void scale(const T* din, T* dout, int num, float scale, float bias);

template <typename T>
void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim,
           const float* scale_data, const float* bias_data);

template <typename T>
void scale(const T* din, T* dout, int outer_dim, int scale_dim,
           const float* scale_data, const float* bias_data);

}  // namespace math
}  // namespace arm
}  // namespace lite
......
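Editor's note: the two overloads declared above compute dout = din * scale + bias with one scale/bias value per scale_dim entry. A minimal scalar sketch of the (outer_dim, scale_dim, inner_dim) contract follows; it mirrors how batch_norm_compute.cc later in this commit calls lite::arm::math::scale with N / C / H*W sizes, but the standalone helper name apply_channel_scale is hypothetical and not part of the patch.

// Scalar reference with the same semantics as the NEON overload
// scale(din, dout, outer_dim, scale_dim, inner_dim, scale_data, bias_data):
// each slice along scale_dim is transformed as y = x * scale[i] + bias[i].
// (Hypothetical helper, for illustration only.)
void apply_channel_scale(const float* din, float* dout, int outer_dim,
                         int scale_dim, int inner_dim, const float* scale_data,
                         const float* bias_data) {
  for (int n = 0; n < outer_dim; ++n) {
    for (int i = 0; i < scale_dim; ++i) {
      const float* x = din + (n * scale_dim + i) * inner_dim;
      float* y = dout + (n * scale_dim + i) * inner_dim;
      for (int j = 0; j < inner_dim; ++j) {
        y[j] = x[j] * scale_data[i] + bias_data[i];
      }
    }
  }
}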
@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/arm/math/type_trans.h"
#include <arm_neon.h>
#include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"

namespace paddle {
namespace lite {
@@ -23,563 +24,553 @@ namespace math {

template <typename dtype>
void int32_to_dtype(const int* din, dtype* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size);

void fp32_to_int8(const float* din, signed char* dout, const float* scale,
                  int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
  for (int j = 0; j < loop_size; ++j) {
    float inv_scale = 1.f / scale[j % axis_size];
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vscale = vdupq_n_f32(inv_scale);
    float32x4_t vpoff = vdupq_n_f32(0.5f);
    float32x4_t vnoff = vdupq_n_f32(-0.5f);
    const float* din_c = din + j * inner_size;
    signed char* dout_c = dout + j * inner_size;
    if (cnt > 0) {
      int cnt_loop = cnt;
      const float* din_ptr = din_c;
      signed char* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32        \n"
          "ldp q2, q3, [%[in]], #32        \n"
          "0:                              \n" /* main loop */
          "fmul v4.4s, v0.4s, %[scale].4s  \n"
          "fmul v5.4s, v1.4s, %[scale].4s  \n"
          "fmul v6.4s, v2.4s, %[scale].4s  \n"
          "fmul v7.4s, v3.4s, %[scale].4s  \n"
          "ldp q0, q1, [%[in]], #32        \n"
          "subs %[cnt], %[cnt], #1         \n"
          "FCVTAS v8.4s, v4.4s             \n"
          "FCVTAS v9.4s, v5.4s             \n"
          "FCVTAS v10.4s, v6.4s            \n"
          "FCVTAS v11.4s, v7.4s            \n"
          "ldp q2, q3, [%[in]], #32        \n"
          "sqxtn  v4.4h, v8.4s             \n"
          "sqxtn2 v4.8h, v9.4s             \n"
          "sqxtn  v5.4h, v10.4s            \n"
          "sqxtn2 v5.8h, v11.4s            \n"
          "sqxtn  v8.8b, v4.8h             \n"
          "sqxtn2 v8.16b, v5.8h            \n"
          "str q8, [%[out]], #16           \n"
          "bne 0b                          \n"
          : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
            "v11");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[din]]!         @ load in0~in7\n"
          "vld1.32 {d4-d7}, [%[din]]!         @ load in8~in16\n"
          "0:                                 @ main loop\n"
          "vand.i32 q4, %q[vpoff], %q[vpoff]  @ set offset, 0.5\n"
          "vand.i32 q5, q4, q4                @ set offset, 0.5\n"
          "vand.i32 q6, q4, q4                @ set offset, 0.5\n"
          "vand.i32 q7, q4, q4                @ set offset, 0.5\n"
          "vcgt.f32 q8, q0, %q[vzero]         @ get mask > 0, in0\n"
          "vcgt.f32 q9, q1, %q[vzero]         @ get mask > 0, in1\n"
          "vcgt.f32 q10, q2, %q[vzero]        @ get mask > 0, in2\n"
          "vcgt.f32 q11, q3, %q[vzero]        @ get mask > 0, in3\n"
          "vbif.f32 q4, %q[vnoff], q8         @ get right offset\n"
          "vbif.f32 q5, %q[vnoff], q9         @ get right offset\n"
          "vbif.f32 q6, %q[vnoff], q10        @ get right offset\n"
          "vbif.f32 q7, %q[vnoff], q11        @ get right offset\n"
          "vmla.f32 q4, q0, %q[vscale]        @ mul scale\n"
          "vmla.f32 q5, q1, %q[vscale]        @ mul scale\n"
          "vmla.f32 q6, q2, %q[vscale]        @ mul scale\n"
          "vmla.f32 q7, q3, %q[vscale]        @ mul scale\n"
          "vcvt.s32.f32 q0, q4                @ cvt to int32\n"
          "vcvt.s32.f32 q1, q5                @ cvt to int32\n"
          "vcvt.s32.f32 q2, q6                @ cvt to int32\n"
          "vcvt.s32.f32 q3, q7                @ cvt to int32\n"
          "vqmovn.s32 d8, q0                  @ cnt to int16\n"
          "vqmovn.s32 d9, q1                  @ cnt to int16\n"
          "vqmovn.s32 d10, q2                 @ cnt to int16\n"
          "vqmovn.s32 d11, q3                 @ cnt to int16\n"
          "vld1.32 {d0-d3}, [%[din]]!         @ load in0~in7\n"
          "vqmovn.s16 d12, q4                 @ cnt to int8\n"
          "vqmovn.s16 d13, q5                 @ cnt to int8\n"
          "vld1.32 {d4-d7}, [%[din]]!         @ load in8~in16\n"
          "vst1.32 {d12-d13}, [%[dout]]!      @ write to output\n"
          "subs %[cnt], #1                    @ loop count -1\n"
          "bne 0b                             @ to main loop\n"
          : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
          : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
            [vzero] "w"(vzero)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
            "q11");
#endif
    }
    const float* din_r = din_c + 16 * cnt;
    signed char* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
    }
  }
}

void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 8;
  int remain = inner_size & 7;
  int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
  for (int j = 0; j < loop_size; ++j) {
    float inv_scale = 1.f / scale[j % axis_size];
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vscale = vdupq_n_f32(inv_scale);
    float32x4_t vpoff = vdupq_n_f32(0.5f);
    float32x4_t vnoff = vdupq_n_f32(-0.5f);
    const float* din_c = din + j * inner_size;
    int16_t* dout_c = dout + j * inner_size;
    if (cnt > 0) {
      int cnt_loop = cnt;
      const float* din_ptr = din_c;
      int16_t* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32        \n"
          "0:                              \n" /* main loop */
          "fmul v4.4s, v0.4s, %[scale].4s  \n"
          "fmul v5.4s, v1.4s, %[scale].4s  \n"
          "ldp q0, q1, [%[in]], #32        \n"
          "subs %[cnt], %[cnt], #1         \n"
          "FCVTAS v8.4s, v4.4s             \n"
          "FCVTAS v9.4s, v5.4s             \n"
          "sqxtn  v4.4h, v8.4s             \n"
          "sqxtn2 v4.8h, v9.4s             \n"
          "str q4, [%[out]], #16           \n"
          "bne 0b                          \n"
          : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
          : [scale] "w"(vscale)
          : "v0", "v1", "v4", "v5", "v8", "v9");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[din]]!         @ load in0~in7\n"
          "0:                                 @ main loop\n"
          "vand.i32 q4, %q[vpoff], %q[vpoff]  @ set offset, 0.5\n"
          "vand.i32 q5, q4, q4                @ set offset, 0.5\n"
          "vand.i32 q6, q4, q4                @ set offset, 0.5\n"
          "vand.i32 q7, q4, q4                @ set offset, 0.5\n"
          "vcgt.f32 q8, q0, %q[vzero]         @ get mask > 0, in0\n"
          "vcgt.f32 q9, q1, %q[vzero]         @ get mask > 0, in1\n"
          "vbif.f32 q4, %q[vnoff], q8         @ get right offset\n"
          "vbif.f32 q5, %q[vnoff], q9         @ get right offset\n"
          "vmla.f32 q4, q0, %q[vscale]        @ mul scale\n"
          "vmla.f32 q5, q1, %q[vscale]        @ mul scale\n"
          "vcvt.s32.f32 q0, q4                @ cvt to int32\n"
          "vcvt.s32.f32 q1, q5                @ cvt to int32\n"
          "vqmovn.s32 d8, q0                  @ cnt to int16\n"
          "vqmovn.s32 d9, q1                  @ cnt to int16\n"
          "vld1.32 {d0-d3}, [%[din]]!         @ load in0~in7\n"
          "vst1.32 {d8-d9}, [%[dout]]!        @ write to output\n"
          "subs %[cnt], #1                    @ loop count -1\n"
          "bne 0b                             @ to main loop\n"
          : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
          : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
            [vzero] "w"(vzero)
          : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
    }
    const float* din_r = din_c + 8 * cnt;
    int16_t* dout_r = dout_c + 8 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = saturate_cast<int16_t>(roundf(inv_scale * din_r[i]));
    }
  }
}

void int8_to_fp32(const signed char* in, float* out, const float* scale,
                  int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const signed char* din_c = in + n * inner_size;
    float* dout_c = out + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    if (cnt > 0) {
      int loop = cnt;
      const signed char* din_ptr = din_c;
      float* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp d0, d1, [%[in]], #16        \n" /* load 16 int8*/
          "0:                              \n" /* main loop */
          "sshll  v2.8h, v0.8b, #0         \n" /* trans to int16*/
          "sshll  v3.8h, v1.8b, #0         \n" /* trans to int16*/
          "sshll  v4.4s, v2.4h, #0         \n" /* trans to int32*/
          "sshll2 v5.4s, v2.8h, #0         \n" /* trans to int32*/
          "sshll  v6.4s, v3.4h, #0         \n" /* trans to int32*/
          "sshll2 v7.4s, v3.8h, #0         \n" /* trans to int32*/
          "ldp d0, d1, [%[in]], #16        \n" /* load 16 int8*/
          "scvtf  v8.4s, v4.4s             \n" /* trans to fp32*/
          "scvtf  v9.4s, v5.4s             \n" /* trans to fp32*/
          "scvtf  v10.4s, v6.4s            \n" /* trans to fp32*/
          "scvtf  v11.4s, v7.4s            \n" /* trans to fp32*/
          "subs %[loop], %[loop], #1       \n"
          "fmul v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
          "fmul v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
          "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
          "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
          "stp q4, q5, [%[out]], #32       \n" /* write to memory*/
          "stp q6, q7, [%[out]], #32       \n" /* write to memory*/
          "bne 0b                          \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
            "v11");
#else
      asm volatile(
          "vld1.32 {d0-d1}, [%[in]]!     @ load 16 int8\n"
          "0:                            @ main loop\n"
          "vmovl.s8  q2, d0              @ trans to int16\n"
          "vmovl.s8  q3, d1              @ trans to int16\n"
          "vmovl.s16 q4, d4              @ trans to int32\n"
          "vmovl.s16 q5, d5              @ trans to int32\n"
          "vmovl.s16 q6, d6              @ trans to int32\n"
          "vmovl.s16 q7, d7              @ trans to int32\n"
          "vcvt.f32.s32 q0, q4           @ trans to fp32\n"
          "vcvt.f32.s32 q1, q5           @ trans to fp32\n"
          "vcvt.f32.s32 q2, q6           @ trans to fp32\n"
          "vcvt.f32.s32 q3, q7           @ trans to fp32\n"
          "vmul.f32 q4, q0, %q[scale]    @ mul with scale\n"
          "vmul.f32 q5, q1, %q[scale]    @ mul with scale\n"
          "vmul.f32 q6, q2, %q[scale]    @ mul with scale\n"
          "vmul.f32 q7, q3, %q[scale]    @ mul with scale\n"
          "vld1.32 {d0-d1}, [%[in]]!     @ load 16 int8\n"
          "subs %[loop], #1              \n"
          "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory\n"
          "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
          "bne 0b                        \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif  // __aarch64__
    }
    const signed char* din_r = din_c + 16 * cnt;
    float* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = in_scale * din_r[i];
    }
  }
}

void int16_to_fp32(const int16_t* in, float* out, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int16_t* din_c = in + n * inner_size;
    float* dout_c = out + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    if (cnt > 0) {
      int loop = cnt;
      const int16_t* din_ptr = din_c;
      float* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32        \n" /* load 16 int16*/
          "0:                              \n" /* main loop */
          "sshll  v4.4s, v0.4h, #0         \n" /* trans to int32*/
          "sshll2 v5.4s, v0.8h, #0         \n" /* trans to int32*/
          "sshll  v6.4s, v1.4h, #0         \n" /* trans to int32*/
          "sshll2 v7.4s, v1.8h, #0         \n" /* trans to int32*/
          "ldp q0, q1, [%[in]], #32        \n" /* load 16 int16*/
          "scvtf  v8.4s, v4.4s             \n" /* trans to fp32*/
          "scvtf  v9.4s, v5.4s             \n" /* trans to fp32*/
          "scvtf  v10.4s, v6.4s            \n" /* trans to fp32*/
          "scvtf  v11.4s, v7.4s            \n" /* trans to fp32*/
          "subs %[loop], %[loop], #1       \n"
          "fmul v4.4s, v8.4s, %[scale].4s  \n" /* mul with scale*/
          "fmul v5.4s, v9.4s, %[scale].4s  \n" /* mul with scale*/
          "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale*/
          "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale*/
          "stp q4, q5, [%[out]], #32       \n" /* write to memory*/
          "stp q6, q7, [%[out]], #32       \n" /* write to memory*/
          "bne 0b                          \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[in]]!     @ load 16 int16\n"
          "0:                            @ main loop\n"
          "vmovl.s16 q4, d0              @ trans to int32\n"
          "vmovl.s16 q5, d1              @ trans to int32\n"
          "vmovl.s16 q6, d2              @ trans to int32\n"
          "vmovl.s16 q7, d3              @ trans to int32\n"
          "vcvt.f32.s32 q0, q4           @ trans to fp32\n"
          "vcvt.f32.s32 q1, q5           @ trans to fp32\n"
          "vcvt.f32.s32 q2, q6           @ trans to fp32\n"
          "vcvt.f32.s32 q3, q7           @ trans to fp32\n"
          "vmul.f32 q4, q0, %q[scale]    @ mul with scale\n"
          "vmul.f32 q5, q1, %q[scale]    @ mul with scale\n"
          "vmul.f32 q6, q2, %q[scale]    @ mul with scale\n"
          "vmul.f32 q7, q3, %q[scale]    @ mul with scale\n"
          "vld1.32 {d0-d3}, [%[in]]!     @ load 16 int8\n"
          "subs %[loop], #1              \n"
          "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory\n"
          "vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
          "bne 0b                        \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif  // __aarch64__
    }
    const int16_t* din_r = din_c + 16 * cnt;
    float* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = in_scale * din_r[i];
    }
  }
}

void int32_to_fp32(const int* din, float* dout, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int* din_c = din + n * inner_size;
    float* dout_c = dout + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    if (cnt > 0) {
      int loop = cnt;
      const int* din_ptr = din_c;
      float* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32        \n"
          "ldp q2, q3, [%[in]], #32        \n"
          "0:                              \n"
          "scvtf v4.4s, v0.4s              \n"
          "scvtf v5.4s, v1.4s              \n"
          "scvtf v6.4s, v2.4s              \n"
          "scvtf v7.4s, v3.4s              \n"
          "ldp q0, q1, [%[in]], #32        \n"
          "fmul v8.4s, v4.4s, %[scale].4s  \n"
          "fmul v9.4s, v5.4s, %[scale].4s  \n"
          "fmul v10.4s, v6.4s, %[scale].4s \n"
          "fmul v11.4s, v7.4s, %[scale].4s \n"
          "ldp q2, q3, [%[in]], #32        \n"
          "stp q8, q9, [%[out]], #32       \n"
          "stp q10, q11, [%[out]], #32     \n"
          "subs %[loop], %[loop], #1       \n"
          "bne 0b                          \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
            "v11");
#else
      asm volatile(
          "vld1.s32 {d0-d3}, [%[in]]!    \n"
          "vld1.s32 {d4-d7}, [%[in]]!    \n"
          "0:                            \n"
          "vcvt.f32.s32 q4, q0           \n"
          "vcvt.f32.s32 q5, q1           \n"
          "vcvt.f32.s32 q6, q2           \n"
          "vcvt.f32.s32 q7, q3           \n"
          "vld1.s32 {d0-d3}, [%[in]]!    \n"
          "vmul.f32 q8, q4, %q[scale]    \n"
          "vmul.f32 q9, q5, %q[scale]    \n"
          "vmul.f32 q10, q6, %q[scale]   \n"
          "vmul.f32 q11, q7, %q[scale]   \n"
          "vld1.s32 {d4-d7}, [%[in]]!    \n"
          "subs %[loop], #1              \n"
          "vst1.f32 {d16-d19}, [%[out]]! \n"
          "vst1.f32 {d20-d23}, [%[out]]! \n"
          "bne 0b                        \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
            "q11");
#endif  // __aarch64__
    }
    const int* din_r = din_c + 16 * cnt;
    float* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = in_scale * din_r[i];
    }
  }
}

void int32_to_int8(const int* din, signed char* dout, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int* din_c = din + n * inner_size;
    signed char* dout_c = dout + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vpoff = vdupq_n_f32(0.5f);
    float32x4_t vnoff = vdupq_n_f32(-0.5f);
    if (cnt > 0) {
      int loop = cnt;
      const int* din_ptr = din_c;
      signed char* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "0:                               \n"
          "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n"
          "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n"
          "scvtf v4.4s, v0.4s               \n"
          "scvtf v5.4s, v1.4s               \n"
          "scvtf v6.4s, v2.4s               \n"
          "scvtf v7.4s, v3.4s               \n"
          "fmul v0.4s, v4.4s, %[scale].4s   \n"
          "fmul v1.4s, v5.4s, %[scale].4s   \n"
          "fmul v2.4s, v6.4s, %[scale].4s   \n"
          "fmul v3.4s, v7.4s, %[scale].4s   \n"
          "fcvtas v4.4s, v0.4s              \n"
          "fcvtas v5.4s, v1.4s              \n"
          "fcvtas v6.4s, v2.4s              \n"
          "fcvtas v7.4s, v3.4s              \n"
          "sqxtn  v0.4h, v4.4s              \n"
          "sqxtn2 v0.8h, v5.4s              \n"
          "sqxtn  v1.4h, v6.4s              \n"
          "sqxtn2 v1.8h, v7.4s              \n"
          "sqxtn  v2.8b, v0.8h              \n"
          "sqxtn2 v2.16b, v1.8h             \n"
          "st1 {v2.16b}, [%[out]], #16      \n"
          "subs %[loop], %[loop], #1        \n"
          "bne 0b                           \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[din]]!         @ load in0~in7\n"
          "vld1.32 {d4-d7}, [%[din]]!         @ load in8~in16\n"
          "0:                                 @ main loop\n"
          "vcvt.f32.s32 q4, q0                @ cvt to float\n"
          "vcvt.f32.s32 q5, q1                @ cvt to float\n"
          "vcvt.f32.s32 q6, q2                @ cvt to float\n"
          "vcvt.f32.s32 q7, q3                @ cvt to float\n"
          "vand.i32 q0, %q[vpoff], %q[vpoff]  @ set offset, 0.5\n"
          "vand.i32 q1, q0, q0                @ set offset, 0.5\n"
          "vand.i32 q2, q0, q0                @ set offset, 0.5\n"
          "vand.i32 q3, q0, q0                @ set offset, 0.5\n"
          "vcgt.f32 q8, q4, %q[vzero]         @ get mask > 0, in0\n"
          "vcgt.f32 q9, q5, %q[vzero]         @ get mask > 0, in1\n"
          "vcgt.f32 q10, q6, %q[vzero]        @ get mask > 0, in2\n"
          "vcgt.f32 q11, q7, %q[vzero]        @ get mask > 0, in3\n"
          "vbif.f32 q0, %q[vnoff], q8         @ get right offset\n"
          "vbif.f32 q1, %q[vnoff], q9         @ get right offset\n"
          "vbif.f32 q2, %q[vnoff], q10        @ get right offset\n"
          "vbif.f32 q3, %q[vnoff], q11        @ get right offset\n"
          "vmla.f32 q0, q4, %q[vscale]        @ mul scale\n"
          "vmla.f32 q1, q5, %q[vscale]        @ mul scale\n"
          "vmla.f32 q2, q6, %q[vscale]        @ mul scale\n"
          "vmla.f32 q3, q7, %q[vscale]        @ mul scale\n"
          "vcvt.s32.f32 q4, q0                @ cvt to int32\n"
          "vcvt.s32.f32 q5, q1                @ cvt to int32\n"
          "vcvt.s32.f32 q6, q2                @ cvt to int32\n"
          "vcvt.s32.f32 q7, q3                @ cvt to int32\n"
          "vqmovn.s32 d16, q4                 @ cnt to int16\n"
          "vqmovn.s32 d17, q5                 @ cnt to int16\n"
          "vqmovn.s32 d18, q6                 @ cnt to int16\n"
          "vqmovn.s32 d19, q7                 @ cnt to int16\n"
          "vld1.32 {d0-d3}, [%[din]]!         @ load in0~in7\n"
          "vqmovn.s16 d8, q8                  @ cnt to int8\n"
          "vqmovn.s16 d9, q9                  @ cnt to int8\n"
          "vld1.32 {d4-d7}, [%[din]]!         @ load in8~in16\n"
          "vst1.32 {d8-d9}, [%[dout]]!        @ write to output\n"
          "subs %[loop], #1                   @ loop count -1\n"
          "bne 0b                             @ to main loop\n"
          : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
          : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
            [vpoff] "w"(vpoff)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
            "q11");
#endif  // __aarch64__
    }
    const int* din_r = din_c + 16 * cnt;
    int8_t* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
    }
  }
}

void int32_to_int32(const int* din, int* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  int size_all = outer_size * axis_size * inner_size;
  memmove(dout, din, size_all * sizeof(int));
}

template <>
void int32_to_dtype(const int* din, float* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
}

template <>
void int32_to_dtype(const int* din, signed char* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
}

template <>
void int32_to_dtype(const int* din, int* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
}

}  // namespace math
......
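Editor's note: the (axis_size, outer_size, inner_size) arguments above treat the data as outer_size * axis_size slices of inner_size elements, with one scale per slice along the quantization axis. A scalar sketch of the same contract for the int32-to-fp32 path is shown below (plain C++, no NEON; the name dequantize_int32_ref is hypothetical and only for illustration).

#include <cstdint>

// Scalar reference mirroring int32_to_fp32 above: every slice along the
// quantization axis is multiplied by its own scale value.
void dequantize_int32_ref(const int* din, float* dout, const float* scale,
                          int axis_size, int64_t outer_size,
                          int64_t inner_size) {
  int64_t loop_size = outer_size * axis_size;
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int* din_c = din + n * inner_size;
    float* dout_c = dout + n * inner_size;
    for (int64_t i = 0; i < inner_size; ++i) {
      dout_c[i] = in_scale * din_c[i];
    }
  }
}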
@@ -24,13 +24,14 @@ cc_library(variable_lite SRCS variable.cc)
cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
cc_library(cpu_info_lite SRCS cpu_info.cc)
lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
    cpp_op_desc_lite ${tensor_lite})
cc_library(types_lite SRCS types.cc)
cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)
lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto
    PROFILE_DEPS basic_profiler_lite)
cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite)
add_subdirectory(mir)
@@ -56,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
@@ -54,15 +54,15 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) {
              << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
              << ", CPU ARCH: A" << dev->archs_[i];
  }
  VLOG(1) << "L1 DataCache size is: ";
  for (int i = 0; i < dev->compute_core_num_; ++i) {
    VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
  }
  VLOG(1) << "L2 Cache size is: ";
  for (int i = 0; i < dev->compute_core_num_; ++i) {
    VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
  }
  VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
  dev->max_freq_ = max_freq[0];
  for (int j = 1; j < dev->compute_core_num_; ++j) {
......
@@ -107,6 +107,8 @@ class TensorHvy : public TensorBase<TensorHvy> {
    data_.Resize(framework::make_ddim(dims.Vectorize()));
  }

  void Resize(const std::vector<int64_t>& x) { Resize(DDimHvy(x)); }

  void ShareDataWith(const TensorHvy& other) {
    data_.ShareDataWith(other.data_);
  }
......
cc_library(mir_node SRCS node.cc DEPS framework_proto_lite)
cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program_lite)
cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph)
cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes)
cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager)
@@ -20,14 +20,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
  return()
endif()
cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes)
#cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
#        mir_ssa_graph scope_lite op_lite
#        fc_op_lite
#        ${host_kernels}
#        mir_passes
#        mir_pass_manager
#        program_fake_utils
#)
# lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc
#              DEPS
#              mul_op_lite
@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
    mir_passes compatible_pb_lite program_lite ${ops_lite})
endif()
@@ -18,10 +18,10 @@ import numpy as np
import paddle.fluid as fluid
from paddle.fluid.backward import append_backward

a = fluid.layers.data(name="a", shape=[2], dtype='float32')
label = fluid.layers.data(name="label", shape=[10], dtype='float32')
a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False)
cost = fluid.layers.square_error_cost(a1, label)
avg_cost = fluid.layers.mean(cost)
@@ -36,7 +36,7 @@ exe.run(fluid.default_startup_program())
with open('startup_program.pb', 'wb') as f:
    f.write(fluid.default_startup_program().desc.serialize_to_string())

#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32')
#fluid.default_main_program().desc.
@@ -50,7 +50,7 @@ with open('main_program.pb', 'wb') as f:
#outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost])
#sys.exit(0)
fluid.io.save_inference_model("./model2", [a.name], [a1], exe)
#print(numpy.array(outs))
@@ -4,3 +4,4 @@ endif()
lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc)
lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite)
@@ -4,3 +4,4 @@ endif()
nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas_lite SRCS blas.cc)
@@ -18,10 +18,11 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
      DEPS scope_lite op_lite kernel_lite paddle_infer_gencode
      )
  # lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
  #              ${ops_lite} ${host_kernels}
  #              X86_DEPS ${x86_kernels}
  #              )
  # add_dependencies(__generated_code__ test_gen_code_lite)
endif()
cc_library(target_wrapper_host SRCS target_wrapper.cc)
@@ -5,3 +5,4 @@ add_subdirectory(arm)
add_subdirectory(cuda)
add_subdirectory(x86)
@@ -6,10 +6,11 @@ message(STATUS "compile with lite ARM kernels")
cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -18,8 +19,10 @@ lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm mat
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
set(arm_kernels
@@ -29,6 +32,7 @@ set(arm_kernels
    scale_compute_arm
    softmax_compute_arm
    conv_compute_arm
    batch_norm_compute_arm
    elementwise_add_compute_arm
    pool_compute_arm
    split_compute_arm
@@ -36,3 +40,4 @@ set(arm_kernels
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void BatchNormCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
int64_t channel_size = 0;
switch (param.data_layout) {
case DATALAYOUT(kNCHW):
channel_size = x_dims[1];
break;
// case DATALAYOUT(kNHWC):
// channel_size = x_dims[x_dims.size() - 1];
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
new_scale.Resize({channel_size});
new_bias.Resize({channel_size});
auto* scale_data = param.scale->mutable_data<float>();
auto* bias_data = param.bias->mutable_data<float>();
auto* mean_data = param.mean->mutable_data<float>();
auto* variance_data = param.variance->mutable_data<float>();
auto* new_scale_data = new_scale.mutable_data<float>();
auto* new_bias_data = new_bias.mutable_data<float>();
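// Fold the running statistics into a per-channel affine transform so that
// Run() only has to apply y = new_scale[c] * x + new_bias[c]:
//   new_scale[c] = scale[c] / sqrt(variance[c] + epsilon)
//   new_bias[c]  = bias[c] - mean[c] * new_scale[c]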
for (int c = 0; c < channel_size; c++) {
float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon));
new_bias_data[c] =
bias_data[c] - inv_scale * scale_data[c] * mean_data[c];
new_scale_data[c] = inv_scale * scale_data[c];
}
}
}
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto x_data = param.x->mutable_data<float>();
auto y_data = param.y->mutable_data<float>();
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
auto* new_scale_data = new_scale.mutable_data<float>();
auto* new_bias_data = new_bias.mutable_data<float>();
int64_t outer_size = 0;
int64_t channel_size = 0;
int64_t inner_size = 0;
switch (param.data_layout) {
case DATALAYOUT(kNCHW):
outer_size = x_dims[0];
channel_size = x_dims[1];
inner_size = x_dims.Slice(2, x_dims.size()).production();
lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
inner_size, new_scale_data, new_bias_data);
break;
// case DATALAYOUT(kNHWC):
// outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
// channel_size = x_dims[x_dims.size() - 1];
// lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
// new_scale_data, new_bias_data);
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
} else {
// TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
// saved_variance
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW,
paddle::lite::kernels::arm::BatchNormCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
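At inference time (is_test or use_global_stats), batch norm reduces to a per-channel affine transform, which is exactly what PrepareForRun precomputes above. Below is a minimal standalone sketch of that folding, using plain std::vector buffers instead of lite::Tensor; fold_batch_norm and its signature are illustrative, not part of this kernel.
#include <cmath>
#include <vector>
// Illustrative only: fold the BN statistics into per-channel new_scale /
// new_bias so the forward pass becomes y = new_scale[c] * x + new_bias[c].
void fold_batch_norm(const std::vector<float>& scale,
                     const std::vector<float>& bias,
                     const std::vector<float>& mean,
                     const std::vector<float>& variance, float epsilon,
                     std::vector<float>* new_scale,
                     std::vector<float>* new_bias) {
  const size_t channels = scale.size();
  new_scale->resize(channels);
  new_bias->resize(channels);
  for (size_t c = 0; c < channels; ++c) {
    const float inv_std = 1.f / std::sqrt(variance[c] + epsilon);
    (*new_scale)[c] = scale[c] * inv_std;
    (*new_bias)[c] = bias[c] - mean[c] * scale[c] * inv_std;
  }
}
Run() then applies the folded transform with lite::arm::math::scale over the NCHW buffer, so no dedicated batch-norm math routine is needed on ARM for the inference path.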
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class BatchNormCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
void PrepareForRun() override;
void Run() override;
virtual ~BatchNormCompute() = default;
private:
Tensor new_scale;
Tensor new_bias;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <typename dtype>
void batch_norm_compute_ref(const operators::BatchNormParam& param) {
DDim x_dims = param.x->dims();
auto x_data = param.x->mutable_data<dtype>();
auto scale_data = param.scale->mutable_data<dtype>();
auto bias_data = param.bias->mutable_data<dtype>();
auto mean_data = param.mean->mutable_data<dtype>();
auto variance_data = param.variance->mutable_data<dtype>();
auto y_data = param.y->mutable_data<dtype>();
float epsilon = param.epsilon;
float momentum = param.momentum;
DataLayoutType data_layout = param.data_layout;
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
int64_t outer_size = 0;
int64_t channel_size = 0;
int64_t inner_size = 0;
switch (data_layout) {
case DATALAYOUT(kNCHW):
outer_size = x_dims[0];
channel_size = x_dims[1];
inner_size = x_dims.Slice(2, x_dims.size()).production();
break;
// case DATALAYOUT(kNHWC):
// outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
// channel_size = x_dims[x_dims.size() - 1];
// inner_size = 1;
// break;
default:
LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout);
break;
}
auto x_ptr = x_data;
auto y_ptr = y_data;
for (int o = 0; o < outer_size; o++) {
for (int c = 0; c < channel_size; c++) {
for (int i = 0; i < inner_size; i++) {
dtype norm_x =
(*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon);
*y_ptr = norm_x * scale_data[c] + bias_data[c];
x_ptr++;
y_ptr++;
}
}
}
} else {
// TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
// saved_variance
}
}
TEST(batch_norm_arm, retrive_op) {
auto batch_norm =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"batch_norm");
ASSERT_FALSE(batch_norm.empty());
ASSERT_TRUE(batch_norm.front());
}
TEST(batch_norm_arm, init) {
BatchNormCompute batch_norm;
ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat));
ASSERT_EQ(batch_norm.target(), TARGET(kARM));
}
TEST(batch_norm_arm, compute) {
DeviceInfo::Init();
for (auto n : {1, 2}) {
for (auto c : {6, 32 /*, 128*/}) {
for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
for (auto is_test : {/*false, */ true}) {
for (auto use_global_stats : {false, true}) {
for (auto epsilon : {1e-4f, 1e-5f}) {
for (auto momentum : {0.9f, 0.99f}) {
for (auto data_layout :
{DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) {
Tensor x;
Tensor scale;
Tensor bias;
Tensor mean;
Tensor variance;
Tensor y;
Tensor mean_out;
Tensor variance_out;
Tensor saved_mean;
Tensor saved_variance;
Tensor y_ref;
Tensor mean_out_ref;
Tensor variance_out_ref;
Tensor saved_mean_ref;
Tensor saved_variance_ref;
// set the dims of input, output, ref output tensors
std::vector<int64_t> in_out_shape;
switch (data_layout) {
case DATALAYOUT(kNCHW):
in_out_shape = {n, c, h, w};
break;
// case DATALAYOUT(kNHWC):
// in_out_shape = {n, h, w, c};
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(data_layout);
break;
}
x.Resize(in_out_shape);
scale.Resize({c});
bias.Resize({c});
mean.Resize({c});
variance.Resize({c});
y.Resize(in_out_shape);
mean_out.Resize({c});
variance_out.Resize({c});
saved_mean.Resize({c});
saved_variance.Resize({c});
y_ref.Resize(in_out_shape);
mean_out_ref.Resize({c});
variance_out_ref.Resize({c});
saved_mean_ref.Resize({c});
saved_variance_ref.Resize({c});
// initialize the data of input tensors
auto* x_data = x.mutable_data<float>();
auto* scale_data = scale.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* mean_data = mean.mutable_data<float>();
auto* variance_data = variance.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i % 64);
}
for (int i = 0; i < scale.dims().production(); i++) {
scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
}
for (int i = 0; i < bias.dims().production(); i++) {
bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
}
for (int i = 0; i < mean.dims().production(); i++) {
mean_data[i] = static_cast<float>(i) * 0.0565f;
}
for (int i = 0; i < variance.dims().production(); i++) {
variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
}
// prepare kernel params and run
BatchNormCompute batch_norm;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
batch_norm.SetContext(std::move(ctx));
operators::BatchNormParam param;
param.x = &x;
param.scale = &scale;
param.bias = &bias;
param.mean = &mean;
param.variance = &variance;
param.is_test = is_test;
param.use_global_stats = use_global_stats;
param.epsilon = epsilon;
param.momentum = momentum;
param.data_layout = data_layout;
param.y = &y;
param.mean_out = &mean_out;
param.variance_out = &variance_out;
param.saved_mean = &saved_mean;
param.saved_variance = &saved_variance;
batch_norm.SetParam(param);
batch_norm.Launch();
// invoking ref implementation and compare results
param.y = &y_ref;
param.mean_out = &mean_out_ref;
param.variance_out = &variance_out_ref;
param.saved_mean = &saved_mean_ref;
param.saved_variance = &saved_variance_ref;
batch_norm_compute_ref<float>(param);
auto* y_ref_data = y_ref.mutable_data<float>();
for (int i = 0; i < y.dims().production(); i++) {
EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
...@@ -124,6 +124,20 @@ TEST(conv_arm, init) { ...@@ -124,6 +124,20 @@ TEST(conv_arm, init) {
TEST(conv_arm, compute) { TEST(conv_arm, compute) {
DeviceInfo::Init(); DeviceInfo::Init();
#if 1
for (auto n : {2}) {
for (auto ic : {6}) {
for (auto oc : {6}) {
for (auto ih : {9}) {
for (auto iw : {9}) {
for (auto flag_bias : {false, true}) {
for (auto flag_relu : {false, true}) {
for (auto depthwise : {false, true}) {
for (auto dilation : {1}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1, 2}) {
for (auto ks : {1, 3, 5}) {
#else
for (auto n : {1, 2}) { for (auto n : {1, 2}) {
for (auto ic : {6, 32 /*, 128*/}) { for (auto ic : {6, 32 /*, 128*/}) {
for (auto oc : {6, 32 /*, 128*/}) { for (auto oc : {6, 32 /*, 128*/}) {
...@@ -136,6 +150,7 @@ TEST(conv_arm, compute) { ...@@ -136,6 +150,7 @@ TEST(conv_arm, compute) {
for (auto stride : {1, 2}) { for (auto stride : {1, 2}) {
for (auto padding : {0, 1, 2}) { for (auto padding : {0, 1, 2}) {
for (auto ks : {1, 3, 5}) { for (auto ks : {1, 3, 5}) {
#endif
int group = 1; int group = 1;
if (depthwise) { // depthwise convolution ? if (depthwise) { // depthwise convolution ?
group = oc = ic; group = oc = ic;
......
...@@ -22,7 +22,7 @@ namespace lite { ...@@ -22,7 +22,7 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
void FcCompute::Run() { void FcCompute::PrepareForRun() {
auto& param = this->Param<operators::FcParam>(); auto& param = this->Param<operators::FcParam>();
auto x_dims = param.input->dims(); auto x_dims = param.input->dims();
auto w_dims = param.w->dims(); auto w_dims = param.w->dims();
...@@ -31,39 +31,56 @@ void FcCompute::Run() { ...@@ -31,39 +31,56 @@ void FcCompute::Run() {
CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL); CHECK_EQ(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
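// With a single input row (m_ == 1) the FC reduces to a matrix-vector
// product, so the weight is transposed once here ({k_, n_} -> {n_, k_}) to
// give sgemv contiguous access to each output's weights in Run().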
if (m_ == 1) {
if (!transed_weight_) {
transed_weight_ = new Tensor;
}
transed_weight_->Resize({n_, k_});
const auto* w_data = param.w->data<float>();
auto* t_data = transed_weight_->mutable_data<float>();
int i = 0;
for (int nn = 0; nn < n_; ++nn) {
for (int kk = 0; kk < k_; ++kk) {
t_data[i++] = w_data[kk * n_ + nn];
}
}
}
}
void FcCompute::Run() {
auto& param = this->Param<operators::FcParam>();
const auto* i_data = param.input->data<float>(); const auto* i_data = param.input->data<float>();
const auto* w_data = param.w->data<float>(); const auto* w_data = param.w->data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr; const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
auto* o_data = param.output->mutable_data<float>(); auto* o_data = param.output->mutable_data<float>();
int x_h = x_dims.Slice(0, param.in_num_col_dims).production();
int x_w = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(x_w, static_cast<int>(w_dims[0]));
auto& ctx = this->ctx_->template As<ARMContext>(); auto& ctx = this->ctx_->template As<ARMContext>();
if (x_h > 1) { if (m_ > 1) {
float* packed_in = static_cast<float*>(ctx.workspace_data<float>()) + float* packed_in = static_cast<float*>(ctx.workspace_data<float>()) +
ctx.l2_cache_size() / sizeof(float); ctx.l2_cache_size() / sizeof(float);
lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false, lite::arm::math::prepackA(packed_in, i_data, k_, 0, m_, 0, k_, false, &ctx);
&ctx); lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, m_, n_,
lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n, k_, false, false, false, &ctx);
x_w, false, false, false, &ctx);
if (param.bias) { if (param.bias) {
CHECK_EQ(param.bias->numel(), n); CHECK_EQ(param.bias->numel(), n_);
lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n); lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_);
} }
} else { } else {
// use sgemmv CHECK(transed_weight_);
// sgemv((const float*)weights, (const float*)din, (float*)dout, const auto* t_data = transed_weight_->data<float>();
// false, n, x_w, _param->_flag_bias, (float*)bias, false);
lite::arm::math::sgemv(t_data, i_data, o_data, false, n_, k_,
b_data != nullptr, b_data, false);
} }
} }
TargetType FcCompute::target() const { return TARGET(kARM); }
PrecisionType FcCompute::precision() const { return PRECISION(kFloat); }
} // namespace arm } // namespace arm
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
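For the m_ == 1 case the FC degenerates to a matrix-vector product, and the weight transposed in PrepareForRun gives the GEMV row-contiguous access per output. A rough sketch of what that branch computes, with plain float arrays standing in for the tensors and the ARM sgemv routine (fc_gemv_sketch is illustrative only):
// out[n] = sum_k wt[n * k_size + k] * x[k] (+ bias[n]), where wt is the
// transposed weight laid out as {n_size, k_size}.
void fc_gemv_sketch(const float* wt, const float* x, const float* bias,
                    float* out, int n_size, int k_size) {
  for (int n = 0; n < n_size; ++n) {
    float acc = bias ? bias[n] : 0.f;
    for (int k = 0; k < k_size; ++k) {
      acc += wt[n * k_size + k] * x[k];
    }
    out[n] = acc;
  }
}
For m_ > 1 the kernel instead packs the input with prepackA and runs sgemm_prepack, as shown in the Run() body above.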
...@@ -25,12 +25,19 @@ class FcCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { ...@@ -25,12 +25,19 @@ class FcCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public: public:
using param_t = operators::FcParam; using param_t = operators::FcParam;
void PrepareForRun() override;
void Run() override; void Run() override;
TargetType target() const override; ~FcCompute() override {
PrecisionType precision() const override; if (transed_weight_) {
delete transed_weight_;
}
};
virtual ~FcCompute() = default; private:
lite::Tensor* transed_weight_{nullptr};
int m_, n_, k_;
}; };
} // namespace arm } // namespace arm
......
...@@ -14,6 +14,11 @@ ...@@ -14,6 +14,11 @@
#include "paddle/fluid/lite/kernels/arm/fc_compute.h" #include "paddle/fluid/lite/kernels/arm/fc_compute.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h" #include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/op_registry.h"
...@@ -23,6 +28,17 @@ namespace lite { ...@@ -23,6 +28,17 @@ namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
template <typename T>
void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
const T upper = static_cast<T>(2.f)) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
for (int i = 0; i < n; ++i) {
a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
TEST(fc_arm, retrive_op) { TEST(fc_arm, retrive_op) {
auto fc = auto fc =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("fc"); KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("fc");
...@@ -37,108 +53,117 @@ TEST(fc_arm, init) { ...@@ -37,108 +53,117 @@ TEST(fc_arm, init) {
} }
TEST(fc_arm, compare_test) { TEST(fc_arm, compare_test) {
lite::Tensor x, w, b, out, ref; using T = float;
constexpr int batch_size = 2;
x.Resize({batch_size, 3}); for (int m : {1, 2, 3, 4}) {
w.Resize({3, 4}); for (int n : {1, 2, 3, 4}) {
b.Resize({1, 4}); for (int k : {1, 2, 3, 4}) {
out.Resize({batch_size, 4}); for (bool with_bias : {true, false}) {
ref.Resize({batch_size, 4}); VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k
<< (with_bias ? ", with bias" : "");
auto x_data = x.mutable_data<float>(); lite::Tensor x, w, b, out, ref;
auto w_data = w.mutable_data<float>();
auto b_data = b.mutable_data<float>(); x.Resize({m, k});
auto out_data = out.mutable_data<float>(); w.Resize({k, n});
auto ref_data = ref.mutable_data<float>(); b.Resize({1, n});
out.Resize({m, n});
for (int64_t i = 0; i < x.dims().product(); i++) { ref.Resize({m, n});
x_data[i] = static_cast<float>(i);
} auto* x_data = x.mutable_data<T>();
for (int64_t i = 0; i < w.dims().product(); i++) { auto* w_data = w.mutable_data<T>();
w_data[i] = static_cast<float>(i); auto* b_data = with_bias ? b.mutable_data<T>() : nullptr;
}
for (int64_t i = 0; i < b.dims().product(); i++) { auto* out_data = out.mutable_data<T>();
b_data[i] = static_cast<float>(i); auto* ref_data = ref.mutable_data<T>();
}
FillData<T>(x_data, x.dims().production());
lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, // FillData<T>(w_data, w.dims().production());
w_data, 3, 4, // FillData<T>(out_data, out.dims().production(), 0, 0);
b_data, ref_data); FillData<T>(ref_data, ref.dims().production(), 0, 0);
// fc compute kernel if (with_bias) {
FcCompute fc; FillData<T>(b_data, b.dims().production());
operators::FcParam param; }
param.in_num_col_dims = 1; FcCompute fc;
param.input = &x; operators::FcParam param;
param.w = &w;
param.bias = &b; param.input = &x;
param.output = &out; param.w = &w;
param.in_mat_dims = x.dims(); param.bias = with_bias ? &b : nullptr;
param.output = &out;
DeviceInfo::Init(); param.in_num_col_dims = 1;
std::unique_ptr<KernelContext> ctx(new KernelContext); param.in_mat_dims = x.dims();
ctx->As<ARMContext>();
fc.SetParam(param); DeviceInfo::Init();
fc.SetContext(std::move(ctx)); std::unique_ptr<KernelContext> ctx(new KernelContext);
fc.Run(); ctx->As<ARMContext>();
fc.SetParam(param);
VLOG(3) << "output vs ref"; fc.SetContext(std::move(ctx));
for (int i = 0; i < out.dims().product(); i++) { fc.PrepareForRun();
VLOG(3) << out_data[i] << " vs " << ref_data[i]; fc.Run();
}
lite::arm::math::fc_compute_eigen(x_data, m, k, w_data, k, n, b_data,
for (int i = 0; i < out.dims().product(); ++i) { ref_data);
EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
}
}
} }
} }
TEST(fc_arm, num_col_dims) { TEST(fc_arm, num_col_dims) {
FcCompute fc; using T = float;
operators::FcParam param;
for (bool with_bias : {true, false}) {
lite::Tensor x; lite::Tensor x, w, b, out, ref;
lite::Tensor w;
lite::Tensor bias; x.Resize({1, 2, 3});
lite::Tensor output; w.Resize({3, 4});
b.Resize({1, 4});
x.Resize({1, 2, 3}); out.Resize({2, 4});
w.Resize({3, 4}); ref.Resize({2, 4});
bias.Resize({1, 4});
output.Resize({2, 4}); auto* x_data = x.mutable_data<float>();
auto* w_data = w.mutable_data<float>();
auto* x_data = x.mutable_data<float>(); auto* b_data = with_bias ? b.mutable_data<T>() : nullptr;
auto* w_data = w.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>(); auto* out_data = out.mutable_data<T>();
auto* output_data = output.mutable_data<float>(); auto* ref_data = ref.mutable_data<T>();
for (int64_t i = 0; i < x.dims().product(); i++) { FillData<T>(x_data, x.dims().production());
x_data[i] = static_cast<float>(i); FillData<T>(w_data, w.dims().production());
} FillData<T>(out_data, out.dims().production(), 0, 0);
for (int64_t i = 0; i < w.dims().product(); i++) { FillData<T>(ref_data, ref.dims().production(), 0, 0);
w_data[i] = static_cast<float>(i); if (with_bias) {
FillData<T>(b_data, b.dims().production());
}
FcCompute fc;
operators::FcParam param;
param.input = &x;
param.w = &w;
param.bias = with_bias ? &b : nullptr;
param.output = &out;
param.in_num_col_dims = 2;
param.in_mat_dims = x.dims();
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
DeviceInfo::Init();
fc.SetParam(param);
fc.SetContext(std::move(ctx));
fc.PrepareForRun();
fc.Run();
lite::arm::math::fc_compute_eigen(x_data, 2, 3, w_data, 3, 4, b_data,
ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
} }
for (int64_t i = 0; i < bias.dims().product(); i++) {
bias_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < output.dims().product(); i++) {
output_data[i] = static_cast<float>(i);
}
param.in_num_col_dims = 2;
param.input = &x;
param.w = &w;
param.bias = &bias;
param.output = &output;
param.in_mat_dims = x.dims();
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
DeviceInfo::Init();
fc.SetParam(param);
fc.SetContext(std::move(ctx));
fc.Run();
} }
} // namespace arm } // namespace arm
......
...@@ -12,57 +12,57 @@ ...@@ -12,57 +12,57 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include <Eigen/Core> #include "paddle/fluid/lite/kernels/arm/mul_compute.h"
#include "paddle/fluid/lite/core/kernel.h" #include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h" #include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h" #include "paddle/fluid/lite/core/type_system.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace arm {
template <typename T> void MulCompute::PrepareForRun() {
void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h, // TODO(TJ): transpose x or y if necessary
int y_w, T* out) { }
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w); void MulCompute::Run() {
Eigen::Map<const matrix_t> Y(y, y_h, y_w); auto& param = Param<param_t>();
Eigen::Map<matrix_t> Out(out, x_h, y_w);
Out = X * Y; const auto* x_data = param.x->data<float>();
} const auto* y_data = param.y->data<float>();
auto* o_data = param.output->mutable_data<float>();
class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { int m = static_cast<int>(
public: param.x->dims().Slice(0, param.x_num_col_dims).production());
using param_t = operators::MulParam; int x_w =
static_cast<int>(param.x->dims()
.Slice(param.x_num_col_dims, param.x->dims().size())
.production());
int y_h = static_cast<int>(
param.y->dims().Slice(0, param.y_num_col_dims).production());
int n =
static_cast<int>(param.y->dims()
.Slice(param.y_num_col_dims, param.y->dims().size())
.production());
void Run() override { CHECK_EQ(x_w, y_h) << "x_w must be equal to y_h";
auto& param = Param<operators::MulParam>(); auto k = x_w;
core::dim2 x_shape( if (n == 1) {
{static_cast<int>( lite::arm::math::sgemv(x_data, y_data, o_data, false, m, k, false, nullptr,
param.x->dims().Slice(0, param.x_num_col_dims).production()), false);
static_cast<int>(
param.x->dims()
.Slice(param.x_num_col_dims, param.x->dims().size())
.production())});
core::dim2 y_shape(
{static_cast<int>(
param.y->dims().Slice(0, param.y_num_col_dims).production()),
static_cast<int>(
param.y->dims()
.Slice(param.y_num_col_dims, param.y->dims().size())
.production())});
mul_compute_eigen(param.x->data<float>(), x_shape.x, x_shape.y, // } else {
param.y->data<float>(), y_shape.x, y_shape.y, // constexpr bool is_tranposed_y = false;
param.output->mutable_data<float>()); auto& ctx = this->ctx_->template As<ARMContext>();
}
virtual ~MulCompute() = default; float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
}; ctx.l2_cache_size() / sizeof(float);
lite::arm::math::prepackA(packed_x, x_data, k, 0, m, 0, k, false, &ctx);
lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m, n, k,
false, false, is_tranposed_y, &ctx);
}
}
} // namespace arm } // namespace arm
} // namespace kernels } // namespace kernels
......
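The m/k/n values above come from flattening x's leading x_num_col_dims dimensions into rows and the rest into columns (and likewise for y with y_num_col_dims). A small sketch of that flattening over a plain dimension vector; flatten_to_2d is illustrative, not a lite API:
#include <cstdint>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>
// Collapse dims[0, num_col_dims) into the row count and the remaining
// dimensions into the column count, e.g. {2, 3, 4} with num_col_dims = 1
// flattens to {2, 12}.
std::pair<int64_t, int64_t> flatten_to_2d(const std::vector<int64_t>& dims,
                                          int num_col_dims) {
  const int64_t rows =
      std::accumulate(dims.begin(), dims.begin() + num_col_dims, int64_t{1},
                      std::multiplies<int64_t>());
  const int64_t cols = std::accumulate(dims.begin() + num_col_dims, dims.end(),
                                       int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}
With x of shape {2, 3, 4}, y of shape {3, 4, 5}, x_num_col_dims = 1 and y_num_col_dims = 2, this gives a (2 x 12) * (12 x 5) product, which is the case the num_col_dims test below exercises.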
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void PrepareForRun() override;
void Run() override;
virtual ~MulCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <typename T>
void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
const T upper = static_cast<T>(2.f)) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
for (int i = 0; i < n; ++i) {
a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
TEST(mul_arm, retrive_op) {
auto mul =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("mul");
ASSERT_FALSE(mul.empty());
ASSERT_TRUE(mul.front());
}
TEST(mul_arm, init) {
MulCompute mul;
ASSERT_EQ(mul.precision(), PRECISION(kFloat));
ASSERT_EQ(mul.target(), TARGET(kARM));
}
TEST(mul_arm, compare_test) {
using T = float;
for (int m : {1, 2, 3, 4}) {
for (int n : {1, 2, 3, 4}) {
for (int k : {1, 2, 3, 4}) {
VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k;
lite::Tensor x, y, out, ref;
x.Resize({m, k});
y.Resize({k, n});
out.Resize({m, n});
ref.Resize({m, n});
auto* x_data = x.mutable_data<T>();
auto* y_data = y.mutable_data<T>();
auto* out_data = out.mutable_data<T>();
auto* ref_data = ref.mutable_data<T>();
FillData<T>(x_data, x.dims().production());
FillData<T>(y_data, y.dims().production());
FillData<T>(out_data, out.dims().production(), 0, 0);
FillData<T>(ref_data, ref.dims().production(), 0, 0);
MulCompute mul;
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
DeviceInfo::Init();
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
mul.SetParam(param);
mul.SetContext(std::move(ctx));
mul.PrepareForRun();
mul.Run();
lite::arm::math::mul_compute_eigen(x_data, m, k, y_data, k, n,
ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
}
}
}
TEST(mul_arm, num_col_dims) {
using T = float;
lite::Tensor x, y, out, ref;
x.Resize({2, 3, 4});
y.Resize({3, 4, 5});
out.Resize({2, 5});
ref.Resize({2, 5});
auto* x_data = x.mutable_data<T>();
auto* y_data = y.mutable_data<T>();
auto* out_data = out.mutable_data<T>();
auto* ref_data = ref.mutable_data<T>();
FillData<T>(x_data, x.dims().production());
FillData<T>(y_data, y.dims().production());
FillData<T>(out_data, out.dims().production());
FillData<T>(ref_data, out.dims().production());
MulCompute mul;
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
param.x_num_col_dims = 1;
param.y_num_col_dims = 2;
DeviceInfo::Init();
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
mul.SetParam(param);
mul.SetContext(std::move(ctx));
mul.PrepareForRun();
mul.Run();
lite::arm::math::mul_compute_eigen(x_data, 2, 12, y_data, 12, 5, ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
...@@ -182,7 +182,7 @@ TEST(pool_arm, compute) { ...@@ -182,7 +182,7 @@ TEST(pool_arm, compute) {
for (auto stride : {2}) { for (auto stride : {2}) {
for (auto pad : {0}) { for (auto pad : {0}) {
for (auto n : {1, 3, 4, 11}) { for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 11, 4, 1024}) { for (auto c : {1, 3, 11 /* ,1024 */}) { // speedup for ci
for (auto h : {3, 1, 11, 4, 1}) { for (auto h : {3, 1, 11, 4, 1}) {
for (auto w : {1, 3, 4, 12, 1}) { for (auto w : {1, 3, 4, 12, 1}) {
VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w
......
...@@ -54,6 +54,15 @@ TEST(scale_arm, compute) { ...@@ -54,6 +54,15 @@ TEST(scale_arm, compute) {
lite::Tensor output; lite::Tensor output;
lite::Tensor output_ref; lite::Tensor output_ref;
#if 1 // for ci speedup
for (auto n : {1, 3}) {
for (auto c : {1, 3}) {
for (auto h : {3, 4}) {
for (auto w : {4, 3}) {
for (auto bias_after_scale : {true, false}) {
for (auto s : {-1.0f, 0.13f}) {
for (auto b : {-15.f, 0.11234f}) {
#else
for (auto n : {1, 3, 4, 11}) { for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 11, 4}) { for (auto c : {1, 3, 11, 4}) {
for (auto h : {3, 1, 11, 4}) { for (auto h : {3, 1, 11, 4}) {
...@@ -61,6 +70,8 @@ TEST(scale_arm, compute) { ...@@ -61,6 +70,8 @@ TEST(scale_arm, compute) {
for (auto bias_after_scale : {true, false}) { for (auto bias_after_scale : {true, false}) {
for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) { for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) {
for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) { for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) {
#endif
x.Resize(DDim(std::vector<int64_t>({n, c, h, w}))); x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output.Resize(DDim(std::vector<int64_t>({n, c, h, w}))); output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w}))); output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
......
...@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite}) ...@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite})
nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite) nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
...@@ -13,3 +13,4 @@ set(host_kernels ...@@ -13,3 +13,4 @@ set(host_kernels
) )
set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels") set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels")
...@@ -35,3 +35,4 @@ set(x86_kernels ...@@ -35,3 +35,4 @@ set(x86_kernels
) )
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
...@@ -29,9 +29,9 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -29,9 +29,9 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
using param_t = operators::ActivationParam; using param_t = operators::ActivationParam;
void Run() override { void Run() override {
auto &context = context_->As<X86Context>(); auto &context = ctx_->As<X86Context>();
auto &sgd_param = *param_.get_mutable<operators::SGDParam>(); auto &sgd_param = *param_.get_mutable<operators::SGDParam>();
CHECK(context.x86_device_context); CHECK(context.x86_device_context());
// param.Out->template mutable_data<T>(); // param.Out->template mutable_data<T>();
...@@ -45,12 +45,12 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -45,12 +45,12 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
PADDLE_ENFORCE_EQ(grad->numel(), sz); PADDLE_ENFORCE_EQ(grad->numel(), sz);
paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1); paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>(); const T *lr = learning_rate->template data<T>();
const T *param_data = param->data<T>(); const T *param_data = param->template data<T>();
const T *grad_data = grad->data<T>(); const T *grad_data = grad->template data<T>();
int64_t rows_idx = 0; int64_t rows_idx = 0;
T *out_data = T *out_data = param_out->template mutable_data<T>(
param_out->mutable_data<T>(context.x86_device_context->GetPlace()); context.x86_device_context()->GetPlace());
auto sgd = auto sgd =
paddle::operators::jit::KernelFuncs<paddle::operators::jit::SgdTuple<T>, paddle::operators::jit::KernelFuncs<paddle::operators::jit::SgdTuple<T>,
......
...@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des ...@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des
add_subdirectory(pb) add_subdirectory(pb)
add_subdirectory(cpp) add_subdirectory(cpp)
cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite) cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite) cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite)
cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite) cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite)
...@@ -8,6 +8,7 @@ cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS}) ...@@ -8,6 +8,7 @@ cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS})
cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS}) cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS})
cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS}) cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS})
cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} ) cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} )
cc_library(batch_norm_op_lite SRCS batch_norm_op.cc DEPS ${op_DEPS})
cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS}) cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS})
cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS}) cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS})
cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS}) cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS})
...@@ -30,6 +31,7 @@ set(ops_lite ...@@ -30,6 +31,7 @@ set(ops_lite
scale_op_lite scale_op_lite
softmax_op_lite softmax_op_lite
reshape_op_lite reshape_op_lite
batch_norm_op_lite
feed_op_lite feed_op_lite
fetch_op_lite fetch_op_lite
io_copy_op_lite io_copy_op_lite
...@@ -52,4 +54,6 @@ lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc ...@@ -52,4 +54,6 @@ lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc
lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite) lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite)
lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite) lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite)
lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite) lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite)
lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite) lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/batch_norm_op.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool BatchNormOp::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.bias);
CHECK_OR_FALSE(param_.scale);
CHECK_OR_FALSE(param_.mean);
CHECK_OR_FALSE(param_.variance);
CHECK_OR_FALSE(param_.y);
if (!param_.is_test) {
CHECK_OR_FALSE(param_.mean_out);
CHECK_OR_FALSE(param_.variance_out);
CHECK_OR_FALSE(param_.saved_mean);
CHECK_OR_FALSE(param_.saved_variance);
}
auto x_dims = param_.x->dims();
auto scale_dims = param_.scale->dims();
auto bias_dims = param_.bias->dims();
auto mean_dims = param_.mean->dims();
auto variance_dims = param_.variance->dims();
CHECK(x_dims.size() >= 2 && x_dims.size() <= 5)
<< "Input X must have 2 to 5 dimensions.";
CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimension.";
CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimension.";
CHECK_EQ(mean_dims.size(), 1UL) << "Input Mean must have 1 dimension.";
CHECK_EQ(variance_dims.size(), 1UL)
<< "Input Variance must have 1 dimension.";
return true;
}
bool BatchNormOp::InferShape() const {
auto x_dims = param_.x->dims();
int64_t channel_size = 0;
switch (param_.data_layout) {
case DATALAYOUT(kNCHW):
channel_size = x_dims[1];
break;
// case DATALAYOUT(kNHWC):
// channel_size = x_dims[x_dims.size() - 1];
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param_.data_layout);
break;
}
if (!param_.is_test) {
param_.mean_out->Resize({channel_size});
param_.variance_out->Resize({channel_size});
param_.saved_mean->Resize({channel_size});
param_.saved_variance->Resize({channel_size});
}
param_.y->Resize(x_dims);
return true;
}
bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable<Tensor>();
param_.bias =
scope->FindVar(op_desc.Input("Bias").front())->GetMutable<Tensor>();
param_.scale =
scope->FindVar(op_desc.Input("Scale").front())->GetMutable<Tensor>();
param_.mean =
scope->FindVar(op_desc.Input("Mean").front())->GetMutable<Tensor>();
param_.variance =
scope->FindVar(op_desc.Input("Variance").front())->GetMutable<Tensor>();
param_.y = scope->FindVar(op_desc.Output("Y").front())->GetMutable<Tensor>();
param_.is_test = op_desc.GetAttr<bool>("is_test");
param_.use_global_stats = op_desc.GetAttr<bool>("use_global_stats");
if (!param_.is_test) {
param_.mean_out =
scope->FindVar(op_desc.Output("MeanOut").front())->GetMutable<Tensor>();
param_.variance_out = scope->FindVar(op_desc.Output("VarianceOut").front())
->GetMutable<Tensor>();
param_.saved_mean = scope->FindVar(op_desc.Output("SavedMean").front())
->GetMutable<Tensor>();
param_.saved_variance =
scope->FindVar(op_desc.Output("SavedVariance").front())
->GetMutable<Tensor>();
}
param_.epsilon = op_desc.GetAttr<float>("epsilon");
param_.momentum = op_desc.GetAttr<float>("momentum");
std::string data_layout = op_desc.GetAttr<std::string>("data_layout");
CHECK_EQ(data_layout, "NCHW") << "TODO(hong19860320): Only support NCHW.";
// param_.data_layout = StringToDataLayout(data_layout);
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/scope.h"
#include "paddle/fluid/lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class BatchNormOp : public OpLite {
public:
BatchNormOp() {}
explicit BatchNormOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "batch_norm"; }
private:
mutable BatchNormParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/batch_norm_op.h"
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
TEST(batch_norm_op_lite, test) {
// prepare variables
Scope scope;
auto* x = scope.Var("x")->GetMutable<Tensor>();
auto* scale = scope.Var("scale")->GetMutable<Tensor>();
auto* bias = scope.Var("bias")->GetMutable<Tensor>();
auto* mean = scope.Var("mean")->GetMutable<Tensor>();
auto* variance = scope.Var("variance")->GetMutable<Tensor>();
auto* y = scope.Var("y")->GetMutable<Tensor>();
x->Resize({2, 32, 10, 20});
auto x_dims = x->dims();
const int64_t channel_size = x_dims[1]; // NCHW
scale->Resize({channel_size});
bias->Resize({channel_size});
mean->Resize({channel_size});
variance->Resize({channel_size});
// prepare op desc
cpp::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"x"});
desc.SetInput("Scale", {"scale"});
desc.SetInput("Bias", {"bias"});
desc.SetInput("Mean", {"mean"});
desc.SetInput("Variance", {"variance"});
desc.SetOutput("Y", {"y"});
desc.SetAttr("is_test", true);
desc.SetAttr("use_global_stats", false);
desc.SetAttr("epsilon", 1e-5f);
desc.SetAttr("momentum", 0.9f);
desc.SetAttr("data_layout", std::string("NCHW"));
BatchNormOp batch_norm("batch_norm");
batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
batch_norm.Attach(desc, &scope);
batch_norm.CheckShape();
batch_norm.InferShape();
// check output dims
auto y_dims = y->dims();
CHECK_EQ(y_dims.size(), x_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
CHECK_EQ(y_dims[i], x_dims[i]);
}
}
TEST(batch_norm_op_lite, test_enable_is_test) {
// prepare variables
Scope scope;
auto* x = scope.Var("x")->GetMutable<Tensor>();
auto* scale = scope.Var("scale")->GetMutable<Tensor>();
auto* bias = scope.Var("bias")->GetMutable<Tensor>();
auto* mean = scope.Var("mean")->GetMutable<Tensor>();
auto* variance = scope.Var("variance")->GetMutable<Tensor>();
auto* y = scope.Var("y")->GetMutable<Tensor>();
auto* mean_out = scope.Var("mean_out")->GetMutable<Tensor>();
auto* variance_out = scope.Var("variance_out")->GetMutable<Tensor>();
auto* saved_mean = scope.Var("saved_mean")->GetMutable<Tensor>();
auto* saved_variance = scope.Var("saved_variance")->GetMutable<Tensor>();
x->Resize({2, 32, 10, 20});
auto x_dims = x->dims();
const int64_t channel_size = x_dims[1]; // NCHW
scale->Resize({channel_size});
bias->Resize({channel_size});
mean->Resize({channel_size});
variance->Resize({channel_size});
// prepare op desc
cpp::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"x"});
desc.SetInput("Scale", {"scale"});
desc.SetInput("Bias", {"bias"});
desc.SetInput("Mean", {"mean"});
desc.SetInput("Variance", {"variance"});
desc.SetOutput("Y", {"y"});
desc.SetOutput("MeanOut", {"mean_out"});
desc.SetOutput("VarianceOut", {"variance_out"});
desc.SetOutput("SavedMean", {"saved_mean"});
desc.SetOutput("SavedVariance", {"saved_variance"});
desc.SetAttr("is_test", false);
desc.SetAttr("use_global_stats", false);
desc.SetAttr("epsilon", 1e-5f);
desc.SetAttr("momentum", 0.9f);
desc.SetAttr("data_layout", std::string("NCHW"));
BatchNormOp batch_norm("batch_norm");
batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
batch_norm.Attach(desc, &scope);
batch_norm.CheckShape();
batch_norm.InferShape();
// check output dims
auto y_dims = y->dims();
CHECK_EQ(y_dims.size(), x_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
CHECK_EQ(y_dims[i], x_dims[i]);
}
auto mean_out_dims = mean_out->dims();
auto variance_out_dims = variance_out->dims();
auto saved_mean_dims = saved_mean->dims();
auto saved_variance_dims = saved_variance->dims();
CHECK_EQ(mean_out_dims.size(), 1UL);
CHECK_EQ(variance_out_dims.size(), 1UL);
CHECK_EQ(saved_mean_dims.size(), 1UL);
CHECK_EQ(saved_variance_dims.size(), 1UL);
CHECK_EQ(mean_out_dims[0], channel_size);
CHECK_EQ(variance_out_dims[0], channel_size);
CHECK_EQ(saved_mean_dims[0], channel_size);
CHECK_EQ(saved_variance_dims[0], channel_size);
}
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -57,6 +57,7 @@ struct FcParam { ...@@ -57,6 +57,7 @@ struct FcParam {
lite::Tensor* output{}; lite::Tensor* output{};
lite::DDim in_mat_dims; lite::DDim in_mat_dims;
int in_num_col_dims{1}; int in_num_col_dims{1};
bool weight_transposed{false};
}; };
struct ReluParam { struct ReluParam {
...@@ -145,6 +146,25 @@ struct ConvParam { ...@@ -145,6 +146,25 @@ struct ConvParam {
std::string data_format{"Anylayout"}; std::string data_format{"Anylayout"};
}; };
// For BatchNorm op
struct BatchNormParam {
lite::Tensor* x{};
lite::Tensor* bias{};
lite::Tensor* scale{};
lite::Tensor* mean{};
lite::Tensor* variance{};
lite::Tensor* y{};
lite::Tensor* mean_out{};
lite::Tensor* variance_out{};
lite::Tensor* saved_mean{};
lite::Tensor* saved_variance{};
bool is_test{true};
bool use_global_stats{false};
float epsilon;
float momentum;
DataLayoutType data_layout{DATALAYOUT(kNCHW)};
};
// For Pooling op // For Pooling op
struct PoolParam { struct PoolParam {
lite::Tensor* x{}; lite::Tensor* x{};
......
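mean_out, variance_out and momentum are only used on the training path that the ARM kernel currently leaves as a TODO. For reference, the conventional running-statistics update these fields are meant to carry looks roughly like the following (a hedged sketch of standard batch-norm bookkeeping, not code from this commit):
// mean_out = momentum * mean + (1 - momentum) * batch_mean, and similarly
// for variance_out; channels is the size of each per-channel buffer.
void update_running_stats(const float* mean, const float* variance,
                          const float* batch_mean, const float* batch_variance,
                          float momentum, int channels, float* mean_out,
                          float* variance_out) {
  for (int c = 0; c < channels; ++c) {
    mean_out[c] = momentum * mean[c] + (1.f - momentum) * batch_mean[c];
    variance_out[c] =
        momentum * variance[c] + (1.f - momentum) * batch_variance[c];
  }
}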
...@@ -74,7 +74,11 @@ TEST(pool_op_lite, test) { ...@@ -74,7 +74,11 @@ TEST(pool_op_lite, test) {
pool.Attach(desc, &scope); pool.Attach(desc, &scope);
auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}}); auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}});
LOG(INFO) << "kernels.size(): " << kernels.size(); LOG(INFO) << "kernels.size(): " << kernels.size();
#ifdef LITE_WITH_ARM
ASSERT_FALSE(kernels.empty()); ASSERT_FALSE(kernels.empty());
#else
ASSERT_TRUE(kernels.empty());
#endif
} }
} // namespace operators } // namespace operators
......
...@@ -37,7 +37,7 @@ bool SplitOp::InferShape() const { ...@@ -37,7 +37,7 @@ bool SplitOp::InferShape() const {
const auto &sections = param_.sections; const auto &sections = param_.sections;
const int outs_number = outs.size(); const int outs_number = outs.size();
std::vector<lite::DDimLite> outs_dims; std::vector<lite::DDimHvy> outs_dims;
outs_dims.reserve(outs_number); outs_dims.reserve(outs_number);
if (num > 0) { if (num > 0) {
......
...@@ -88,4 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel ...@@ -88,4 +88,4 @@ RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple wheel
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pre-commit
RUN apt-get autoremove -y && apt-get clean RUN apt-get autoremove -y && apt-get clean
RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz RUN rm -rf /sdk-tools-linux-4333796.zip /tmp/android-ndk-r17c-linux-x86_64.zip /cmake-3.10.3-Linux-x86_64.tar.gz
\ No newline at end of file
...@@ -13,6 +13,11 @@ function prepare_for_codegen { ...@@ -13,6 +13,11 @@ function prepare_for_codegen {
mkdir -p ./paddle/fluid/lite/gen_code mkdir -p ./paddle/fluid/lite/gen_code
touch ./paddle/fluid/lite/gen_code/__generated_code__.cc touch ./paddle/fluid/lite/gen_code/__generated_code__.cc
} }
function check_need_ci {
git log -1 --oneline | grep "test=develop" || exit -1
}
function cmake_x86 { function cmake_x86 {
prepare_for_codegen prepare_for_codegen
cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags} cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
...@@ -28,6 +33,17 @@ function cmake_gpu { ...@@ -28,6 +33,17 @@ function cmake_gpu {
cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON"
} }
function check_style {
export PATH=/usr/bin:$PATH
#pre-commit install
clang-format --version
if ! pre-commit run -a ; then
git diff
exit 1
fi
}
function cmake_arm { function cmake_arm {
# $1: ARM_TARGET_OS in "android" , "armlinux" # $1: ARM_TARGET_OS in "android" , "armlinux"
# $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf" # $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf"
...@@ -43,10 +59,15 @@ function cmake_arm { ...@@ -43,10 +59,15 @@ function cmake_arm {
-DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2 -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
} }
function build_single {
#make $1 -j$(expr $(nproc) - 2)
make $1 -j8
}
function build { function build {
file=$1 file=$1
for _test in $(cat $file); do for _test in $(cat $file); do
make $_test -j$(expr $(nproc) - 2) build_single $_test
done done
} }
...@@ -58,44 +79,12 @@ function test_lite { ...@@ -58,44 +79,12 @@ function test_lite {
for _test in $(cat $file); do for _test in $(cat $file); do
# We move the build phase here to make the 'gen_code' test compiles after the # We move the build phase here to make the 'gen_code' test compiles after the
# corresponding test is executed and the C++ code generates. # corresponding test is executed and the C++ code generates.
make $_test -j$(expr $(nproc) - 2) #make $_test -j$(expr $(nproc) - 2)
make $_test -j8
ctest -R $_test -V ctest -R $_test -V
done done
} }
port_armv8=5554
port_armv7=5556
# Run test on android
function test_lite_android {
local file=$1
local adb_abi=$2
local port=
if [[ ${adb_abi} == "armeabi-v7a" ]]; then
port=${port_armv7}
fi
if [[ ${adb_abi} == "arm64-v8a" ]]; then
port=${port_armv8}
fi
if [[ "${port}x" == "x" ]]; then
echo "Port can not be empty"
exit 1
fi
echo "file: ${file}"
# push all to adb and test
adb_work_dir="/data/local/tmp"
skip_list="test_model_parser_lite"
for _test in $(cat $file); do
[[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test'
testpath=$(find ./paddle/fluid -name ${_test})
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}"
adb -s emulator-${port} shell "./${adb_work_dir}/${_test}"
done
}
# Build the code and run lite server tests. This is executed in the CI system. # Build the code and run lite server tests. This is executed in the CI system.
function build_test_server { function build_test_server {
mkdir -p ./build mkdir -p ./build
...@@ -108,8 +97,34 @@ function build_test_server { ...@@ -108,8 +97,34 @@ function build_test_server {
build $LIBS_FILE build $LIBS_FILE
} }
# Build the code and run lite server tests. This is executed in the CI system. # test_arm_android <some_test_name> <adb_port_number>
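# e.g.: test_arm_android test_softmax_compute_arm 5554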
function test_arm_android {
test_name=$1
port=$2
if [[ "${test_name}x" == "x" ]]; then
echo "test_name can not be empty"
exit 1
fi
if [[ "${port}x" == "x" ]]; then
echo "Port can not be empty"
exit 1
fi
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list="test_model_parser_lite" # add more with space
if [[ $skip_list =~ (^|[[:space:]])${test_name}($|[[:space:]]) ]]; then echo "skip ${test_name}"; return 0; fi
testpath=$(find ./paddle/fluid -name ${test_name})
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
}
# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm { function build_test_arm {
port_armv8=5554
port_armv7=5556
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
# start android arm64-v8a armeabi-v7a emulators first
@@ -122,6 +137,7 @@ function build_test_arm {
for os in "android" "armlinux" ; do
for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do
# TODO(TJ): enable compile of v7-hf on android and of all v7 on armlinux
if [[ ${abi} == "armeabi-v7a-hf" ]]; then
echo "armeabi-v7a-hf is not supported on android or armlinux"
continue
@@ -138,17 +154,30 @@ function build_test_arm {
cmake_arm ${os} ${abi}
build $TESTS_FILE
# armlinux tests need to run in another docker container
# TODO(TJ): enable test with armlinux
if [[ ${os} == "android" ]]; then
adb_abi=${abi}
if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then
adb_abi="armeabi-v7a"
fi
if [[ ${adb_abi} == "armeabi-v7a" ]]; then
# skip all armv7 tests
# TODO(TJ): enable test with armv7
continue
fi
local port=
if [[ ${adb_abi} == "armeabi-v7a" ]]; then
port=${port_armv7}
fi
if [[ ${adb_abi} == "arm64-v8a" ]]; then
port=${port_armv8}
fi
echo "test file: ${TESTS_FILE}"
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
done
fi
cd -
done
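# Hedged manual check, not part of the original script: before the loop above
# pushes binaries, the emulators started earlier in this function should be
# visible to adb under the port-derived names (5554 for arm64-v8a, 5556 for
# armeabi-v7a, matching port_armv8/port_armv7 above).
adb devices | grep -E 'emulator-(5554|5556)'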
@@ -164,12 +193,13 @@ function print_usage {
echo "----------------------------------------"
echo -e "cmake_x86: run cmake with X86 mode"
echo -e "cmake_cuda: run cmake with CUDA mode"
echo -e "--arm_os=<os> --arm_abi=<abi> cmake_arm: run cmake with ARM mode"
echo
echo -e "build: compile the tests"
echo -e "--test_name=<test_name> build_single: compile single test"
echo
echo -e "test_server: run server tests"
echo -e "--test_name=<test_name> --arm_port=<adb_port_number> test_arm_android: run arm test"
echo "----------------------------------------" echo "----------------------------------------"
echo echo
} }
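# Hedged example matching the ARM line of the usage text above; the os/abi
# values are illustrative and the command is assumed to run from a dedicated
# ARM build directory.
bash ../paddle/fluid/lite/tools/build.sh --arm_os=android --arm_abi=arm64-v8a cmake_arm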
@@ -182,11 +212,31 @@ function main {
TESTS_FILE="${i#*=}"
shift
;;
--test_name=*)
TEST_NAME="${i#*=}"
shift
;;
--arm_os=*)
ARM_OS="${i#*=}"
shift
;;
--arm_abi=*)
ARM_ABI="${i#*=}"
shift
;;
--arm_port=*)
ARM_PORT="${i#*=}"
shift
;;
build)
build $TESTS_FILE
build $LIBS_FILE
shift
;;
build_single)
build_single $TEST_NAME
shift
;;
cmake_x86)
cmake_x86
shift
@@ -196,15 +246,15 @@ function main {
shift
;;
cmake_arm)
cmake_arm $ARM_OS $ARM_ABI
shift
;;
test_server)
test_lite $TESTS_FILE
shift
;;
test_arm_android)
test_arm_android $TEST_NAME $ARM_PORT
shift
;;
build_test_server)
@@ -215,6 +265,14 @@ function main {
build_test_arm
shift
;;
check_style)
check_style
shift
;;
check_need_ci)
check_need_ci
shift
;;
*)
# unknown option
print_usage
@@ -224,7 +282,5 @@ function main {
done
}
print_usage
main $@
@@ -9,3 +9,4 @@ set(utils_DEPS glog)
lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite)
cc_library(any_lite SRCS any.cc)
cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite)
@@ -4,3 +4,4 @@ endif()
cc_library(target_wrapper_x86 SRCS target_wrapper.cc)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig
from paddle.proto.ModelConfig_pb2 import ModelConfig