Commit ace19269 authored by S superjomn

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into HEAD

before_script:
- env
image: $SERVER_LITE_DOCKER_IMAGE
stages:
- ci
- build_server
- build_mobile
check:prebuilt:
tags:
- lite
stage: ci
script:
#- pip3 install pre-commit
#- alias python=python3
- rm -rf ~/.pip
- pip install pre-commit
- pre-commit install
- ./paddle/fluid/lite/tools/build.sh check_style
#- ./paddle/fluid/lite/tools/build.sh check_need_ci
cache:
key: check_style
paths:
- /root/.cache
build:server:
tags:
- lite
image: $SERVER_LITE_DOCKER_IMAGE
stage: build_server
cache:
key: server_thirdparty
paths:
- build/third_party
- /root/.ccache
script:
- apt install ccache
- export http_proxy=http://172.19.57.45:3128
- export https_proxy=http://172.19.57.45:3128
#- export http_proxy=http://agent.baidu.com:8118
#- export https_proxy=http://agent.baidu.com:8118
- mkdir -p build
- cd build
- ../paddle/fluid/lite/tools/build.sh cmake_x86
- make extern_eigen3
- make extern_boost
- make framework_proto
- make extern_warpctc
- cd ..
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/build/third_party/install/mklml/lib
- ./paddle/fluid/lite/tools/build.sh build_test_server
dependencies:
- check:prebuilt
build:mobile:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- /root/.ccache
script:
- apt install ccache
- export http_proxy=http://172.19.57.45:3128
- export https_proxy=http://172.19.57.45:3128
- ./paddle/fluid/lite/tools/build.sh build_test_arm
dependencies:
- build:server
......@@ -166,6 +166,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
#include(external/zlib) # download, build, install gtest
include(external/protobuf) # download, build, install protobuf
include(external/eigen) # download eigen3
include(ccache) # set ccache for compilation
include(generic) # simplify cmake module
include(configure) # add paddle env configuration
......
......@@ -172,3 +172,4 @@ add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
add_subdirectory(gen_code)
......@@ -14,7 +14,7 @@ if(LITE_WITH_CUDA)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})
#cc_library(light_api_lite SRCS light_api.cc DEPS ${light_api_deps} ${ops_lite} ${host_kernels})
message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}")
......
......@@ -66,7 +66,7 @@ USE_LITE_OP(fetch);
USE_LITE_OP(io_copy);
USE_LITE_OP(conv2d);
USE_LITE_OP(batch_norm);
// USE_LITE_OP(batch_norm);
USE_LITE_OP(relu);
USE_LITE_OP(depthwise_conv2d);
USE_LITE_OP(pool2d);
......
......@@ -32,5 +32,7 @@ cc_library(math_arm SRCS
conv_winograd_3x3.cc
conv_winograd.cc
split.cc
DEPS ${lite_kernel_deps} eigen3)
DEPS ${lite_kernel_deps} eigen3 framework_proto_lite)
# TODO(TJ): fix me, do not depend on proto
......@@ -58,6 +58,111 @@ void scale<float>(const float* din, float* dout, int num, float scale,
}
}
template <>
void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
int inner_dim, const float* scale_data,
const float* bias_data) {
int cnt = inner_dim >> 4;
int remain = inner_dim % 16;
int size = inner_dim * scale_dim;
for (int n = 0; n < outer_dim; n++) {
const float* din_ptr_n = din + n * size;
float* dout_ptr_n = dout + n * size;
#pragma omp parallel for
for (int i = 0; i < scale_dim; i++) {
const float* din_ptr = din_ptr_n + i * inner_dim;
float* dout_ptr = dout_ptr_n + i * inner_dim;
float scale = scale_data[i];
float32x4_t vscale = vdupq_n_f32(scale);
float bias = bias_data[i];
float32x4_t vbias = vdupq_n_f32(bias);
for (int j = 0; j < cnt; j++) {
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
float32x4_t vsum1 = vmlaq_f32(vbias, din0, vscale);
float32x4_t vsum2 = vmlaq_f32(vbias, din1, vscale);
float32x4_t vsum3 = vmlaq_f32(vbias, din2, vscale);
float32x4_t vsum4 = vmlaq_f32(vbias, din3, vscale);
din_ptr += 16;
vst1q_f32(dout_ptr, vsum1);
vst1q_f32(dout_ptr + 4, vsum2);
vst1q_f32(dout_ptr + 8, vsum3);
vst1q_f32(dout_ptr + 12, vsum4);
dout_ptr += 16;
}
for (int j = 0; j < remain; j++) {
*dout_ptr = *din_ptr * scale + bias;
dout_ptr++;
din_ptr++;
}
}
}
}
template <>
void scale<float>(const float* din, float* dout, int outer_dim, int scale_dim,
const float* scale_data, const float* bias_data) {
int cnt = scale_dim >> 4;
int remain = scale_dim % 16;
for (int n = 0; n < outer_dim; n++) {
const float* din_ptr_n = din + n * scale_dim;
float* dout_ptr_n = dout + n * scale_dim;
#pragma omp parallel for
for (int i = 0; i < cnt; i++) {
int idx = i << 4;
const float* din_ptr = din_ptr_n + idx;
const float* scale_ptr = scale_data + idx;
const float* bias_ptr = bias_data + idx;
float* dout_ptr = dout_ptr_n + idx;
float32x4_t din0 = vld1q_f32(din_ptr);
float32x4_t vscale0 = vld1q_f32(scale_ptr);
float32x4_t vbias0 = vld1q_f32(bias_ptr);
float32x4_t din1 = vld1q_f32(din_ptr + 4);
float32x4_t vscale1 = vld1q_f32(scale_ptr + 4);
float32x4_t vbias1 = vld1q_f32(bias_ptr + 4);
float32x4_t din2 = vld1q_f32(din_ptr + 8);
float32x4_t vscale2 = vld1q_f32(scale_ptr + 8);
float32x4_t vbias2 = vld1q_f32(bias_ptr + 8);
float32x4_t vsum1 = vmlaq_f32(vbias0, din0, vscale0);
float32x4_t vsum2 = vmlaq_f32(vbias1, din1, vscale1);
float32x4_t din3 = vld1q_f32(din_ptr + 12);
float32x4_t vscale3 = vld1q_f32(scale_ptr + 12);
float32x4_t vbias3 = vld1q_f32(bias_ptr + 12);
vst1q_f32(dout_ptr, vsum1);
vst1q_f32(dout_ptr + 4, vsum2);
float32x4_t vsum3 = vmlaq_f32(vbias2, din2, vscale2);
float32x4_t vsum4 = vmlaq_f32(vbias3, din3, vscale3);
vst1q_f32(dout_ptr + 8, vsum3);
vst1q_f32(dout_ptr + 12, vsum4);
}
int idx = cnt << 4;
const float* din_ptr = din_ptr_n + idx;
float* dout_ptr = dout_ptr_n + idx;
const float* scale_ptr = scale_data + idx;
const float* bias_ptr = bias_data + idx;
for (int j = 0; j < remain; j++) {
*dout_ptr = *din_ptr * (*scale_ptr) + (*bias_ptr);
dout_ptr++;
din_ptr++;
scale_ptr++;
bias_ptr++;
}
}
}
} // namespace math
} // namespace arm
} // namespace lite
......
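For reference, a minimal scalar sketch of what the two vectorized overloads above compute; the name scale_ref is illustrative and not part of this patch. Each output element is the input multiplied by its channel's scale and offset by its channel's bias, with the second overload corresponding to the inner_dim == 1 case:
void scale_ref(const float* din, float* dout, int outer_dim, int scale_dim,
               int inner_dim, const float* scale_data, const float* bias_data) {
  // Plain-loop equivalent of the NEON kernels: dout = din * scale[c] + bias[c].
  for (int n = 0; n < outer_dim; ++n) {
    for (int c = 0; c < scale_dim; ++c) {
      const float s = scale_data[c];
      const float b = bias_data[c];
      for (int i = 0; i < inner_dim; ++i) {
        const int idx = (n * scale_dim + c) * inner_dim + i;
        dout[idx] = din[idx] * s + b;
      }
    }
  }
}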
......@@ -22,6 +22,14 @@ namespace math {
template <typename T>
void scale(const T* din, T* dout, int num, float scale, float bias);
template <typename T>
void scale(const T* din, T* dout, int outer_dim, int scale_dim, int inner_dim,
const float* scale_data, const float* bias_data);
template <typename T>
void scale(const T* din, T* dout, int outer_dim, int scale_dim,
const float* scale_data, const float* bias_data);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -12,9 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/arm/math/saturate.h"
#include "paddle/fluid/lite/arm/math/type_trans.h"
#include <arm_neon.h>
#include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"
namespace paddle {
namespace lite {
......@@ -23,14 +24,13 @@ namespace math {
template <typename dtype>
void int32_to_dtype(const int* din, dtype* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size);
int axis_size, int64_t outer_size, int64_t inner_size);
void fp32_to_int8(const float* din, signed char* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
long long loop_size = outer_size * axis_size;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int j = 0; j < loop_size; ++j) {
......@@ -69,10 +69,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
"sqxtn2 v8.16b, v5.8h \n"
"str q8, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop)
: [scale] "w" (vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
);
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
......@@ -110,10 +110,11 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
"subs %[cnt], #1 @ loop count -1\n"
"bne 0b @ to main loop\n"
:[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop)
:[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
);
: [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
: [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif
}
const float* din_r = din_c + 16 * cnt;
......@@ -125,11 +126,10 @@ void fp32_to_int8(const float* din, signed char* dout, const float* scale,
}
void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 8;
int remain = inner_size & 7;
long long loop_size = outer_size * axis_size;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (int j = 0; j < loop_size; ++j) {
......@@ -158,10 +158,9 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
"sqxtn2 v4.8h, v9.4s \n"
"str q4, [%[out]], #16 \n"
"bne 0b \n"
: [in] "+r" (din_ptr), [out] "+r" (dout_ptr), [cnt] "+r" (cnt_loop)
: [scale] "w" (vscale)
: "v0", "v1", "v4", "v5", "v8", "v9"
);
: [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v8", "v9");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
......@@ -185,10 +184,10 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
"subs %[cnt], #1 @ loop count -1\n"
"bne 0b @ to main loop\n"
:[dout]"+r"(dout_ptr), [din]"+r"(din_ptr), [cnt]"+r"(cnt_loop)
:[vscale]"w"(vscale), [vpoff]"w"(vpoff), [vnoff]"w"(vnoff), [vzero]"w"(vzero)
:"q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9"
);
: [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
: [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
[vzero] "w"(vzero)
: "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
}
const float* din_r = din_c + 8 * cnt;
......@@ -200,13 +199,12 @@ void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
}
void int8_to_fp32(const signed char* in, float* out, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
long long loop_size = axis_size * outer_size;
int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) {
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const signed char* din_c = in + n * inner_size;
float* dout_c = out + n * inner_size;
......@@ -245,10 +243,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
"stp q6, q7, [%[out]], #32 \n" /* write to memory*/
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
);
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11");
#else
asm volatile(
"vld1.32 {d0-d1}, [%[in]]! @ load 16 int8\n"
......@@ -276,11 +274,10 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
);
#endif //__aarch64__
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const signed char* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt;
......@@ -290,21 +287,20 @@ void int8_to_fp32(const signed char* in, float* out, const float* scale,
}
}
void int16_to_fp32(const short* in, float* out, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
void int16_to_fp32(const int16_t* in, float* out, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
long long loop_size = axis_size * outer_size;
int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) {
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const short* din_c = in + n * inner_size;
const int16_t* din_c = in + n * inner_size;
float* dout_c = out + n * inner_size;
float32x4_t vscale = vdupq_n_f32(in_scale);
if (cnt > 0) {
int loop = cnt;
const short* din_ptr = din_c;
const int16_t* din_ptr = din_c;
float* dout_ptr = dout_c;
#ifdef __aarch64__
asm volatile(
......@@ -333,10 +329,9 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
"stp q6, q7, [%[out]], #32 \n" /* write to memory*/
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
);
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[in]]! @ load 16 int16\n"
......@@ -362,13 +357,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
"vst1.f32 {d12-d15}, [%[out]]! @ write to memory\n"
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
);
#endif //__aarch64__
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
}
const short* din_r = din_c + 16 * cnt;
const int16_t* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt;
for (int i = 0; i < remain; ++i) {
dout_r[i] = in_scale * din_r[i];
......@@ -377,12 +371,12 @@ void int16_to_fp32(const short* in, float* out, const float* scale,
}
void int32_to_fp32(const int* din, float* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
long long loop_size = axis_size * outer_size;
int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) {
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const int* din_c = din + n * inner_size;
float* dout_c = dout + n * inner_size;
......@@ -410,10 +404,10 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
"stp q10, q11, [%[out]], #32 \n"
"subs %[loop], %[loop], #1 \n"
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
);
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
"v11");
#else
asm volatile(
"vld1.s32 {d0-d3}, [%[in]]! \n"
......@@ -433,11 +427,11 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
"vst1.f32 {d16-d19}, [%[out]]! \n"
"vst1.f32 {d20-d23}, [%[out]]! \n"
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
);
#endif //__aarch64__
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif // __aarch64__
}
const int* din_r = din_c + 16 * cnt;
float* dout_r = dout_c + 16 * cnt;
......@@ -447,13 +441,13 @@ void int32_to_fp32(const int* din, float* dout, const float* scale,
}
}
void int32_to_int8(const int* din, signed char* dout, const float* scale, \
int axis_size, long long outer_size, long long inner_size) {
void int32_to_int8(const int* din, signed char* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int cnt = inner_size / 16;
int remain = inner_size & 15;
long long loop_size = outer_size * axis_size;
int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
for (long long n = 0; n < loop_size; ++n) {
for (int64_t n = 0; n < loop_size; ++n) {
float in_scale = scale[n % axis_size];
const int* din_c = din + n * inner_size;
signed char* dout_c = dout + n * inner_size;
......@@ -497,10 +491,9 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
"st1 {v2.16b}, [%[out]], #16 \n"
"subs %[loop], %[loop], #1 \n"
"bne 0b \n"
:[loop] "+r" (loop), [in] "+r" (din_ptr), [out] "+r" (dout_ptr)
:[scale] "w" (vscale)
:"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
: [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
: [scale] "w"(vscale)
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
asm volatile(
"vld1.32 {d0-d3}, [%[din]]! @ load in0~in7\n"
......@@ -541,11 +534,12 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
"vst1.32 {d8-d9}, [%[dout]]! @ write to output\n"
"subs %[loop], #1 @ loop count -1\n"
"bne 0b @ to main loop\n"
:[loop] "+r" (loop), [din] "+r" (din_ptr), [dout] "+r" (dout_ptr)
:[vscale] "w" (vscale), [vzero] "w"(vzero), [vnoff] "w" (vnoff), [vpoff] "w" (vpoff)
:"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11"
);
#endif //__aarch64__
: [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
: [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
[vpoff] "w"(vpoff)
: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10",
"q11");
#endif // __aarch64__
}
const int* din_r = din_c + 16 * cnt;
int8_t* dout_r = dout_c + 16 * cnt;
......@@ -555,30 +549,27 @@ void int32_to_int8(const int* din, signed char* dout, const float* scale, \
}
}
void int32_to_int32(const int* din, int* dout, const float* scale, \
int axis_size, long long outer_size, long long inner_size) {
void int32_to_int32(const int* din, int* dout, const float* scale,
int axis_size, int64_t outer_size, int64_t inner_size) {
int size_all = outer_size * axis_size * inner_size;
memmove(dout, din, size_all*sizeof(int));
memmove(dout, din, size_all * sizeof(int));
}
template <>
void int32_to_dtype(const int* din, float* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
}
template <>
void int32_to_dtype(const int* din, signed char* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
}
template <>
void int32_to_dtype(const int* din, int* dout, const float* scale,
int axis_size, long long outer_size, long long inner_size) {
int axis_size, int64_t outer_size, int64_t inner_size) {
return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
}
......
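The remainder loops above show the scalar semantics that the assembly vectorizes: each slice of inner_size elements is multiplied by the scale entry of its axis. A minimal dequantization sketch for the intX_to_fp32 routines (dequant_ref is an illustrative name, not part of the patch):
#include <cstdint>
template <typename T>
void dequant_ref(const T* din, float* dout, const float* scale, int axis_size,
                 int64_t outer_size, int64_t inner_size) {
  // One scale per axis entry; outer_size * axis_size slices of inner_size elements.
  const int64_t loop_size = outer_size * axis_size;
  for (int64_t n = 0; n < loop_size; ++n) {
    const float s = scale[n % axis_size];
    for (int64_t i = 0; i < inner_size; ++i) {
      dout[n * inner_size + i] = s * static_cast<float>(din[n * inner_size + i]);
    }
  }
}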
......@@ -24,13 +24,14 @@ cc_library(variable_lite SRCS variable.cc)
cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
cc_library(cpu_info_lite SRCS cpu_info.cc)
cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite)
lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
cpp_op_desc_lite ${tensor_lite})
cc_library(types_lite SRCS types.cc)
cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)
lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto)
lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto
PROFILE_DEPS basic_profiler_lite)
cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite)
add_subdirectory(mir)
......@@ -56,3 +57,4 @@ lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_li
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
lite_cc_test(test_context_lite SRCS context_test.cc DEPS context_lite X86_DEPS operator)
......@@ -54,15 +54,15 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) {
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
LOG(INFO) << "L1 DataCache size is: ";
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << dev->L1_cache_[i] / 1024 << " KB";
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "L2 Cache size is: ";
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << dev->L2_cache_[i] / 1024 << " KB";
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "Total memory: " << dev->max_memory_ << "KB";
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
......
......@@ -107,6 +107,8 @@ class TensorHvy : public TensorBase<TensorHvy> {
data_.Resize(framework::make_ddim(dims.Vectorize()));
}
void Resize(const std::vector<int64_t>& x) { Resize(DDimHvy(x)); }
void ShareDataWith(const TensorHvy& other) {
data_.ShareDataWith(other.data_);
}
......
cc_library(mir_node SRCS node.cc DEPS framework_proto_lite)
cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node)
cc_library(mir_ssa_graph SRCS ssa_graph.cc DEPS mir_node program_lite)
cc_library(mir_pass SRCS pass.cc DEPS mir_ssa_graph)
cc_library(mir_pass_manager SRCS pass_manager.cc DEPS mir_pass mir_ssa_graph mir_passes)
cc_library(mir_pass_registry SRCS pass_registry.cc DEPS mir_pass_manager)
......@@ -20,14 +20,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
return()
endif()
cc_test(test_mir_pass_manager SRCS pass_manager_test.cc DEPS mir_pass_manager mir_passes)
cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
mir_ssa_graph scope_lite op_lite
fc_op_lite
${host_kernels}
mir_passes
mir_pass_manager
program_fake_utils
)
#cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
#mir_ssa_graph scope_lite op_lite
#fc_op_lite
#${host_kernels}
#mir_passes
#mir_pass_manager
#program_fake_utils
#)
# lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc
# DEPS
# mul_op_lite
......@@ -59,3 +59,4 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
mir_passes compatible_pb_lite program_lite ${ops_lite})
endif()
......@@ -18,10 +18,10 @@ import numpy as np
import paddle.fluid as fluid
from paddle.fluid.backward import append_backward
a = fluid.layers.data(name="a", shape=[100], dtype='float32')
label = fluid.layers.data(name="label", shape=[100], dtype='float32')
a = fluid.layers.data(name="a", shape=[2], dtype='float32')
label = fluid.layers.data(name="label", shape=[10], dtype='float32')
a1 = fluid.layers.fc(input=a, size=500, act=None, bias_attr=False)
a1 = fluid.layers.fc(input=a, size=3, act=None, bias_attr=False)
cost = fluid.layers.square_error_cost(a1, label)
avg_cost = fluid.layers.mean(cost)
......@@ -36,7 +36,7 @@ exe.run(fluid.default_startup_program())
with open('startup_program.pb', 'wb') as f:
f.write(fluid.default_startup_program().desc.serialize_to_string())
data_1 = np.array(numpy.random.random([100, 100]), dtype='float32')
#data_1 = np.array(numpy.random.random([100, 100]), dtype='float32')
#fluid.default_main_program().desc.
......@@ -50,7 +50,7 @@ with open('main_program.pb', 'wb') as f:
#outs = exe.run(program=prog, feed={'a':data_1, }, fetch_list=[cost])
sys.exit(0)
#sys.exit(0)
fluid.io.save_inference_model("./model2", [a.name], [a1], exe)
print(numpy.array(outs))
#print(numpy.array(outs))
......@@ -4,3 +4,4 @@ endif()
lite_cc_library(basic_profiler_lite SRCS basic_profiler.cc)
lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler_lite)
......@@ -4,3 +4,4 @@ endif()
nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas_lite SRCS blas.cc)
......@@ -18,10 +18,11 @@ if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
DEPS scope_lite op_lite kernel_lite paddle_infer_gencode
)
lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
${ops_lite} ${host_kernels}
X86_DEPS ${x86_kernels}
)
# lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_code__
# ${ops_lite} ${host_kernels}
# X86_DEPS ${x86_kernels}
# )
add_dependencies(__generated_code__ test_gen_code_lite)
# add_dependencies(__generated_code__ test_gen_code_lite)
endif()
cc_library(target_wrapper_host SRCS target_wrapper.cc)
......@@ -5,3 +5,4 @@ add_subdirectory(arm)
add_subdirectory(cuda)
add_subdirectory(x86)
......@@ -6,10 +6,11 @@ message(STATUS "compile with lite ARM kernels")
cc_library(fc_compute_arm SRCS fc_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(softmax_compute_arm SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(conv_compute_arm SRCS conv_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(batch_norm_compute_arm SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(elementwise_add_compute_arm SRCS elementwise_add_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
......@@ -18,8 +19,10 @@ lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm mat
lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
lite_cc_test(test_conv_compute_arm SRCS conv_compute_test.cc DEPS conv_compute_arm)
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
lite_cc_test(test_elementwise_add_compute_arm SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_arm)
lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
set(arm_kernels
......@@ -29,6 +32,7 @@ set(arm_kernels
scale_compute_arm
softmax_compute_arm
conv_compute_arm
batch_norm_compute_arm
elementwise_add_compute_arm
pool_compute_arm
split_compute_arm
......@@ -36,3 +40,4 @@ set(arm_kernels
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void BatchNormCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
int64_t channel_size = 0;
switch (param.data_layout) {
case DATALAYOUT(kNCHW):
channel_size = x_dims[1];
break;
// case DATALAYOUT(kNHWC):
// channel_size = x_dims[x_dims.size() - 1];
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
new_scale.Resize({channel_size});
new_bias.Resize({channel_size});
auto* scale_data = param.scale->mutable_data<float>();
auto* bias_data = param.bias->mutable_data<float>();
auto* mean_data = param.mean->mutable_data<float>();
auto* variance_data = param.variance->mutable_data<float>();
auto* new_scale_data = new_scale.mutable_data<float>();
auto* new_bias_data = new_bias.mutable_data<float>();
for (int c = 0; c < channel_size; c++) {
float inv_scale = 1.f / (std::sqrt(variance_data[c] + param.epsilon));
new_bias_data[c] =
bias_data[c] - inv_scale * scale_data[c] * mean_data[c];
new_scale_data[c] = inv_scale * scale_data[c];
}
}
}
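// Note: with global statistics (is_test || use_global_stats), batch
// normalization reduces to a per-channel affine transform
//   y = (x - mean) / sqrt(variance + epsilon) * scale + bias
//     = new_scale * x + new_bias,
// where new_scale = scale / sqrt(variance + epsilon) and
//       new_bias  = bias - new_scale * mean.
// PrepareForRun precomputes new_scale/new_bias once; Run applies them via
// lite::arm::math::scale.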
void BatchNormCompute::Run() {
auto& param = this->Param<param_t>();
auto x_dims = param.x->dims();
auto x_data = param.x->mutable_data<float>();
auto y_data = param.y->mutable_data<float>();
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
auto* new_scale_data = new_scale.mutable_data<float>();
auto* new_bias_data = new_bias.mutable_data<float>();
int64_t outer_size = 0;
int64_t channel_size = 0;
int64_t inner_size = 0;
switch (param.data_layout) {
case DATALAYOUT(kNCHW):
outer_size = x_dims[0];
channel_size = x_dims[1];
inner_size = x_dims.Slice(2, x_dims.size()).production();
lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
inner_size, new_scale_data, new_bias_data);
break;
// case DATALAYOUT(kNHWC):
// outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
// channel_size = x_dims[x_dims.size() - 1];
// lite::arm::math::scale(x_data, y_data, outer_size, channel_size,
// new_scale_data, new_bias_data);
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param.data_layout);
break;
}
} else {
// TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
// saved_variance
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW,
paddle::lite::kernels::arm::BatchNormCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Mean", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Variance", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class BatchNormCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::BatchNormParam;
void PrepareForRun() override;
void Run() override;
virtual ~BatchNormCompute() = default;
private:
Tensor new_scale;
Tensor new_bias;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <typename dtype>
void batch_norm_compute_ref(const operators::BatchNormParam& param) {
DDim x_dims = param.x->dims();
auto x_data = param.x->mutable_data<dtype>();
auto scale_data = param.scale->mutable_data<dtype>();
auto bias_data = param.bias->mutable_data<dtype>();
auto mean_data = param.mean->mutable_data<dtype>();
auto variance_data = param.variance->mutable_data<dtype>();
auto y_data = param.y->mutable_data<dtype>();
float epsilon = param.epsilon;
float momentum = param.momentum;
DataLayoutType data_layout = param.data_layout;
bool global_stats = param.is_test || param.use_global_stats;
if (global_stats) {
int64_t outer_size = 0;
int64_t channel_size = 0;
int64_t inner_size = 0;
switch (data_layout) {
case DATALAYOUT(kNCHW):
outer_size = x_dims[0];
channel_size = x_dims[1];
inner_size = x_dims.Slice(2, x_dims.size()).production();
break;
// case DATALAYOUT(kNHWC):
// outer_size = x_dims.Slice(0, x_dims.size() - 1).production();
// channel_size = x_dims[x_dims.size() - 1];
// inner_size = 1;
// break;
default:
LOG(FATAL) << "Unknown storage order: " << DataLayoutToStr(data_layout);
break;
}
auto x_ptr = x_data;
auto y_ptr = y_data;
for (int o = 0; o < outer_size; o++) {
for (int c = 0; c < channel_size; c++) {
for (int i = 0; i < inner_size; i++) {
dtype norm_x =
(*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon);
*y_ptr = norm_x * scale_data[c] + bias_data[c];
x_ptr++;
y_ptr++;
}
}
}
} else {
// TODO(hong19860320) calculate mean_out, variance_out, saved_mean and
// saved_variance
}
}
TEST(batch_norm_arm, retrive_op) {
auto batch_norm =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"batch_norm");
ASSERT_FALSE(batch_norm.empty());
ASSERT_TRUE(batch_norm.front());
}
TEST(batch_norm_arm, init) {
BatchNormCompute batch_norm;
ASSERT_EQ(batch_norm.precision(), PRECISION(kFloat));
ASSERT_EQ(batch_norm.target(), TARGET(kARM));
}
TEST(batch_norm_arm, compute) {
DeviceInfo::Init();
for (auto n : {1, 2}) {
for (auto c : {6, 32 /*, 128*/}) {
for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
for (auto is_test : {/*false, */ true}) {
for (auto use_global_stats : {false, true}) {
for (auto epsilon : {1e-4f, 1e-5f}) {
for (auto momentum : {0.9f, 0.99f}) {
for (auto data_layout :
{DATALAYOUT(kNCHW) /*, DATALAYOUT(kNHWC)*/}) {
Tensor x;
Tensor scale;
Tensor bias;
Tensor mean;
Tensor variance;
Tensor y;
Tensor mean_out;
Tensor variance_out;
Tensor saved_mean;
Tensor saved_variance;
Tensor y_ref;
Tensor mean_out_ref;
Tensor variance_out_ref;
Tensor saved_mean_ref;
Tensor saved_variance_ref;
// set the dims of input, output, ref output tensors
std::vector<int64_t> in_out_shape;
switch (data_layout) {
case DATALAYOUT(kNCHW):
in_out_shape = {n, c, h, w};
break;
// case DATALAYOUT(kNHWC):
// in_out_shape = {n, h, w, c};
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(data_layout);
break;
}
x.Resize(in_out_shape);
scale.Resize({c});
bias.Resize({c});
mean.Resize({c});
variance.Resize({c});
y.Resize(in_out_shape);
mean_out.Resize({c});
variance_out.Resize({c});
saved_mean.Resize({c});
saved_variance.Resize({c});
y_ref.Resize(in_out_shape);
mean_out_ref.Resize({c});
variance_out_ref.Resize({c});
saved_mean_ref.Resize({c});
saved_variance_ref.Resize({c});
// initialize the data of input tensors
auto* x_data = x.mutable_data<float>();
auto* scale_data = scale.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* mean_data = mean.mutable_data<float>();
auto* variance_data = variance.mutable_data<float>();
auto* y_data = y.mutable_data<float>();
for (int i = 0; i < x.dims().production(); i++) {
x_data[i] = static_cast<float>(i % 64);
}
for (int i = 0; i < scale.dims().production(); i++) {
scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
}
for (int i = 0; i < bias.dims().production(); i++) {
bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
}
for (int i = 0; i < mean.dims().production(); i++) {
mean_data[i] = static_cast<float>(i) * 0.0565f;
}
for (int i = 0; i < variance.dims().production(); i++) {
variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
}
// prepare kernel params and run
BatchNormCompute batch_norm;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
batch_norm.SetContext(std::move(ctx));
operators::BatchNormParam param;
param.x = &x;
param.scale = &scale;
param.bias = &bias;
param.mean = &mean;
param.variance = &variance;
param.is_test = is_test;
param.use_global_stats = use_global_stats;
param.epsilon = epsilon;
param.momentum = momentum;
param.data_layout = data_layout;
param.y = &y;
param.mean_out = &mean_out;
param.variance_out = &variance_out;
param.saved_mean = &saved_mean;
param.saved_variance = &saved_variance;
batch_norm.SetParam(param);
batch_norm.Launch();
// invoking ref implementation and compare results
param.y = &y_ref;
param.mean_out = &mean_out_ref;
param.variance_out = &variance_out_ref;
param.saved_mean = &saved_mean_ref;
param.saved_variance = &saved_variance_ref;
batch_norm_compute_ref<float>(param);
auto* y_ref_data = y_ref.mutable_data<float>();
for (int i = 0; i < y.dims().production(); i++) {
EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5);
}
}
}
}
}
}
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
......@@ -124,6 +124,20 @@ TEST(conv_arm, init) {
TEST(conv_arm, compute) {
DeviceInfo::Init();
#if 1
for (auto n : {2}) {
for (auto ic : {6}) {
for (auto oc : {6}) {
for (auto ih : {9}) {
for (auto iw : {9}) {
for (auto flag_bias : {false, true}) {
for (auto flag_relu : {false, true}) {
for (auto depthwise : {false, true}) {
for (auto dilation : {1}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1, 2}) {
for (auto ks : {1, 3, 5}) {
#else
for (auto n : {1, 2}) {
for (auto ic : {6, 32 /*, 128*/}) {
for (auto oc : {6, 32 /*, 128*/}) {
......@@ -136,6 +150,7 @@ TEST(conv_arm, compute) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1, 2}) {
for (auto ks : {1, 3, 5}) {
#endif
int group = 1;
if (depthwise) { // depthwise convolution ?
group = oc = ic;
......
......@@ -22,7 +22,7 @@ namespace lite {
namespace kernels {
namespace arm {
void FcCompute::Run() {
void FcCompute::PrepareForRun() {
auto& param = this->Param<operators::FcParam>();
auto x_dims = param.input->dims();
auto w_dims = param.w->dims();
......@@ -31,39 +31,56 @@ void FcCompute::Run() {
CHECK_EQ(w_dims.size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
if (m_ == 1) {
if (!transed_weight_) {
transed_weight_ = new Tensor;
}
transed_weight_->Resize({n_, k_});
const auto* w_data = param.w->data<float>();
auto* t_data = transed_weight_->mutable_data<float>();
int i = 0;
for (int nn = 0; nn < n_; ++nn) {
for (int kk = 0; kk < k_; ++kk) {
t_data[i++] = w_data[kk * n_ + nn];
}
}
}
}
void FcCompute::Run() {
auto& param = this->Param<operators::FcParam>();
const auto* i_data = param.input->data<float>();
const auto* w_data = param.w->data<float>();
const auto* b_data = param.bias ? param.bias->data<float>() : nullptr;
auto* o_data = param.output->mutable_data<float>();
int x_h = x_dims.Slice(0, param.in_num_col_dims).production();
int x_w = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
int n = w_dims[1];
CHECK_EQ(x_w, static_cast<int>(w_dims[0]));
auto& ctx = this->ctx_->template As<ARMContext>();
if (x_h > 1) {
if (m_ > 1) {
float* packed_in = static_cast<float*>(ctx.workspace_data<float>()) +
ctx.l2_cache_size() / sizeof(float);
lite::arm::math::prepackA(packed_in, i_data, x_w, 0, x_h, 0, x_w, false,
&ctx);
lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, x_h, n,
x_w, false, false, false, &ctx);
lite::arm::math::prepackA(packed_in, i_data, k_, 0, m_, 0, k_, false, &ctx);
lite::arm::math::sgemm_prepack(packed_in, w_data, b_data, o_data, m_, n_,
k_, false, false, false, &ctx);
if (param.bias) {
CHECK_EQ(param.bias->numel(), n);
lite::arm::math::fill_bias_fc(o_data, b_data, x_h, n);
CHECK_EQ(param.bias->numel(), n_);
lite::arm::math::fill_bias_fc(o_data, b_data, m_, n_);
}
} else {
// use sgemmv
// sgemv((const float*)weights, (const float*)din, (float*)dout,
// false, n, x_w, _param->_flag_bias, (float*)bias, false);
CHECK(transed_weight_);
const auto* t_data = transed_weight_->data<float>();
lite::arm::math::sgemv(t_data, i_data, o_data, false, n_, k_,
b_data != nullptr, b_data, false);
}
}
TargetType FcCompute::target() const { return TARGET(kARM); }
PrecisionType FcCompute::precision() const { return PRECISION(kFloat); }
} // namespace arm
} // namespace kernels
} // namespace lite
......
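For context, a scalar sketch of the fully connected computation the kernel above implements, out[i][j] = sum_k x[i][k] * w[k][j] + bias[j]; the patch only moves the shape bookkeeping (m_, k_, n_) and the m_ == 1 weight transpose into PrepareForRun, then dispatches to sgemm_prepack for m_ > 1 or sgemv on the pre-transposed weight for m_ == 1. The name fc_ref is illustrative, not part of the patch:
void fc_ref(const float* x, const float* w, const float* bias, float* out,
            int m, int k, int n) {
  // x is m x k (row-major), w is k x n, bias has n entries (may be null).
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = bias ? bias[j] : 0.f;
      for (int p = 0; p < k; ++p) {
        acc += x[i * k + p] * w[p * n + j];
      }
      out[i * n + j] = acc;
    }
  }
}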
......@@ -25,12 +25,19 @@ class FcCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void PrepareForRun() override;
void Run() override;
TargetType target() const override;
PrecisionType precision() const override;
~FcCompute() override {
if (transed_weight_) {
delete transed_weight_;
}
};
virtual ~FcCompute() = default;
private:
lite::Tensor* transed_weight_{nullptr};
int m_, n_, k_;
};
} // namespace arm
......
......@@ -14,6 +14,11 @@
#include "paddle/fluid/lite/kernels/arm/fc_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
......@@ -23,6 +28,17 @@ namespace lite {
namespace kernels {
namespace arm {
template <typename T>
void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
const T upper = static_cast<T>(2.f)) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
for (int i = 0; i < n; ++i) {
a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
TEST(fc_arm, retrive_op) {
auto fc =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("fc");
......@@ -37,43 +53,46 @@ TEST(fc_arm, init) {
}
TEST(fc_arm, compare_test) {
using T = float;
for (int m : {1, 2, 3, 4}) {
for (int n : {1, 2, 3, 4}) {
for (int k : {1, 2, 3, 4}) {
for (bool with_bias : {true, false}) {
VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k
<< (with_bias ? ", with bias" : "");
lite::Tensor x, w, b, out, ref;
constexpr int batch_size = 2;
x.Resize({batch_size, 3});
w.Resize({3, 4});
b.Resize({1, 4});
out.Resize({batch_size, 4});
ref.Resize({batch_size, 4});
auto x_data = x.mutable_data<float>();
auto w_data = w.mutable_data<float>();
auto b_data = b.mutable_data<float>();
auto out_data = out.mutable_data<float>();
auto ref_data = ref.mutable_data<float>();
x.Resize({m, k});
w.Resize({k, n});
b.Resize({1, n});
out.Resize({m, n});
ref.Resize({m, n});
for (int64_t i = 0; i < x.dims().product(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < w.dims().product(); i++) {
w_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < b.dims().product(); i++) {
b_data[i] = static_cast<float>(i);
}
auto* x_data = x.mutable_data<T>();
auto* w_data = w.mutable_data<T>();
auto* b_data = with_bias ? b.mutable_data<T>() : nullptr;
auto* out_data = out.mutable_data<T>();
auto* ref_data = ref.mutable_data<T>();
lite::arm::math::fc_compute_eigen(x_data, batch_size, 3, //
w_data, 3, 4, //
b_data, ref_data);
FillData<T>(x_data, x.dims().production());
FillData<T>(w_data, w.dims().production());
FillData<T>(out_data, out.dims().production(), 0, 0);
FillData<T>(ref_data, ref.dims().production(), 0, 0);
if (with_bias) {
FillData<T>(b_data, b.dims().production());
}
// fc compute kernel
FcCompute fc;
operators::FcParam param;
param.in_num_col_dims = 1;
param.input = &x;
param.w = &w;
param.bias = &b;
param.bias = with_bias ? &b : nullptr;
param.output = &out;
param.in_num_col_dims = 1;
param.in_mat_dims = x.dims();
DeviceInfo::Init();
......@@ -81,55 +100,53 @@ TEST(fc_arm, compare_test) {
ctx->As<ARMContext>();
fc.SetParam(param);
fc.SetContext(std::move(ctx));
fc.PrepareForRun();
fc.Run();
VLOG(3) << "output vs ref";
for (int i = 0; i < out.dims().product(); i++) {
VLOG(3) << out_data[i] << " vs " << ref_data[i];
lite::arm::math::fc_compute_eigen(x_data, m, k, w_data, k, n, b_data,
ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
}
}
for (int i = 0; i < out.dims().product(); ++i) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-5);
}
}
TEST(fc_arm, num_col_dims) {
FcCompute fc;
operators::FcParam param;
using T = float;
lite::Tensor x;
lite::Tensor w;
lite::Tensor bias;
lite::Tensor output;
for (bool with_bias : {true, false}) {
lite::Tensor x, w, b, out, ref;
x.Resize({1, 2, 3});
w.Resize({3, 4});
bias.Resize({1, 4});
output.Resize({2, 4});
b.Resize({1, 4});
out.Resize({2, 4});
ref.Resize({2, 4});
auto* x_data = x.mutable_data<float>();
auto* w_data = w.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* output_data = output.mutable_data<float>();
auto* b_data = with_bias ? b.mutable_data<T>() : nullptr;
for (int64_t i = 0; i < x.dims().product(); i++) {
x_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < w.dims().product(); i++) {
w_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < bias.dims().product(); i++) {
bias_data[i] = static_cast<float>(i);
}
for (int64_t i = 0; i < output.dims().product(); i++) {
output_data[i] = static_cast<float>(i);
}
auto* out_data = out.mutable_data<T>();
auto* ref_data = ref.mutable_data<T>();
param.in_num_col_dims = 2;
FillData<T>(x_data, x.dims().production());
FillData<T>(w_data, w.dims().production());
FillData<T>(out_data, out.dims().production(), 0, 0);
FillData<T>(ref_data, ref.dims().production(), 0, 0);
if (with_bias) {
FillData<T>(b_data, b.dims().production());
}
FcCompute fc;
operators::FcParam param;
param.input = &x;
param.w = &w;
param.bias = &bias;
param.output = &output;
param.bias = with_bias ? &b : nullptr;
param.output = &out;
param.in_num_col_dims = 2;
param.in_mat_dims = x.dims();
std::unique_ptr<KernelContext> ctx(new KernelContext);
......@@ -138,7 +155,15 @@ TEST(fc_arm, num_col_dims) {
fc.SetParam(param);
fc.SetContext(std::move(ctx));
fc.PrepareForRun();
fc.Run();
lite::arm::math::fc_compute_eigen(x_data, 2, 3, w_data, 3, 4, b_data,
ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
}
} // namespace arm
......
......@@ -12,57 +12,57 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Core>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <typename T>
void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h,
int y_w, T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> Y(y, y_h, y_w);
Eigen::Map<matrix_t> Out(out, x_h, y_w);
Out = X * Y;
void MulCompute::PrepareForRun() {
// TODO(TJ): transpose x or y if necessary
}
class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void MulCompute::Run() {
auto& param = Param<param_t>();
void Run() override {
auto& param = Param<operators::MulParam>();
core::dim2 x_shape(
{static_cast<int>(
param.x->dims().Slice(0, param.x_num_col_dims).production()),
static_cast<int>(
param.x->dims()
const auto* x_data = param.x->data<float>();
const auto* y_data = param.y->data<float>();
auto* o_data = param.output->mutable_data<float>();
int m = static_cast<int>(
param.x->dims().Slice(0, param.x_num_col_dims).production());
int x_w =
static_cast<int>(param.x->dims()
.Slice(param.x_num_col_dims, param.x->dims().size())
.production())});
core::dim2 y_shape(
{static_cast<int>(
param.y->dims().Slice(0, param.y_num_col_dims).production()),
static_cast<int>(
param.y->dims()
.production());
int y_h = static_cast<int>(
param.y->dims().Slice(0, param.y_num_col_dims).production());
int n =
static_cast<int>(param.y->dims()
.Slice(param.y_num_col_dims, param.y->dims().size())
.production())});
.production());
mul_compute_eigen(param.x->data<float>(), x_shape.x, x_shape.y, //
param.y->data<float>(), y_shape.x, y_shape.y, //
param.output->mutable_data<float>());
}
CHECK_EQ(x_w, y_h) << "x_w must be equal to y_h";
auto k = x_w;
if (n == 1) {
lite::arm::math::sgemv(x_data, y_data, o_data, false, m, k, false, nullptr,
false);
virtual ~MulCompute() = default;
};
} else {
constexpr bool is_tranposed_y = false;
auto& ctx = this->ctx_->template As<ARMContext>();
float* packed_x = static_cast<float*>(ctx.workspace_data<float>()) +
ctx.l2_cache_size() / sizeof(float);
lite::arm::math::prepackA(packed_x, x_data, k, 0, m, 0, k, false, &ctx);
lite::arm::math::sgemm_prepack(packed_x, y_data, nullptr, o_data, m, n, k,
false, false, is_tranposed_y, &ctx);
}
}
} // namespace arm
} // namespace kernels
......
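A scalar sketch of what MulCompute::Run computes after flattening x to an m x k matrix at x_num_col_dims and y to a k x n matrix at y_num_col_dims; the kernel dispatches to sgemv when n == 1 and to prepacked SGEMM otherwise. The name mul_ref is illustrative, not part of the patch:
void mul_ref(const float* x, const float* y, float* out, int m, int k, int n) {
  // Plain row-major matrix multiply: out (m x n) = x (m x k) * y (k x n).
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) {
        acc += x[i * k + p] * y[p * n + j];
      }
      out[i * n + j] = acc;
    }
  }
}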
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void PrepareForRun() override;
void Run() override;
virtual ~MulCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
template <typename T>
void FillData(T* a, const int n, const T lower = static_cast<T>(-2.f),
const T upper = static_cast<T>(2.f)) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
for (int i = 0; i < n; ++i) {
a[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
}
TEST(mul_arm, retrive_op) {
auto mul =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>("mul");
ASSERT_FALSE(mul.empty());
ASSERT_TRUE(mul.front());
}
TEST(mul_arm, init) {
MulCompute mul;
ASSERT_EQ(mul.precision(), PRECISION(kFloat));
ASSERT_EQ(mul.target(), TARGET(kARM));
}
TEST(mul_arm, compare_test) {
using T = float;
for (int m : {1, 2, 3, 4}) {
for (int n : {1, 2, 3, 4}) {
for (int k : {1, 2, 3, 4}) {
VLOG(3) << "m: " << m << ", n: " << n << ", k: " << k;
lite::Tensor x, y, out, ref;
x.Resize({m, k});
y.Resize({k, n});
out.Resize({m, n});
ref.Resize({m, n});
auto* x_data = x.mutable_data<T>();
auto* y_data = y.mutable_data<T>();
auto* out_data = out.mutable_data<T>();
auto* ref_data = ref.mutable_data<T>();
FillData<T>(x_data, x.dims().production());
FillData<T>(y_data, y.dims().production());
FillData<T>(out_data, out.dims().production(), 0, 0);
FillData<T>(ref_data, ref.dims().production(), 0, 0);
MulCompute mul;
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
DeviceInfo::Init();
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
mul.SetParam(param);
mul.SetContext(std::move(ctx));
mul.PrepareForRun();
mul.Run();
lite::arm::math::mul_compute_eigen(x_data, m, k, y_data, k, n,
ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
}
}
}
TEST(mul_arm, num_col_dims) {
using T = float;
lite::Tensor x, y, out, ref;
x.Resize({2, 3, 4});
y.Resize({3, 4, 5});
out.Resize({2, 5});
ref.Resize({2, 5});
auto* x_data = x.mutable_data<T>();
auto* y_data = y.mutable_data<T>();
auto* out_data = out.mutable_data<T>();
auto* ref_data = ref.mutable_data<T>();
FillData<T>(x_data, x.dims().production());
FillData<T>(y_data, y.dims().production());
FillData<T>(out_data, out.dims().production());
FillData<T>(ref_data, out.dims().production());
MulCompute mul;
operators::MulParam param;
param.x = &x;
param.y = &y;
param.output = &out;
param.x_num_col_dims = 1;
param.y_num_col_dims = 2;
DeviceInfo::Init();
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<ARMContext>();
mul.SetParam(param);
mul.SetContext(std::move(ctx));
mul.PrepareForRun();
mul.Run();
lite::arm::math::mul_compute_eigen(x_data, 2, 12, y_data, 12, 5, ref_data);
for (int i = 0; i < out.dims().production(); i++) {
EXPECT_NEAR(out_data[i], ref_data[i], 1e-3);
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
......@@ -182,7 +182,7 @@ TEST(pool_arm, compute) {
for (auto stride : {2}) {
for (auto pad : {0}) {
for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 11, 4, 1024}) {
for (auto c : {1, 3, 11 /* ,1024 */}) { // speedup for ci
for (auto h : {3, 1, 11, 4, 1}) {
for (auto w : {1, 3, 4, 12, 1}) {
VLOG(3) << "n:" << n << " c:" << c << " h:" << h << " w:" << w
......
......@@ -54,6 +54,15 @@ TEST(scale_arm, compute) {
lite::Tensor output;
lite::Tensor output_ref;
#if 1 // for ci speedup
for (auto n : {1, 3}) {
for (auto c : {1, 3}) {
for (auto h : {3, 4}) {
for (auto w : {4, 3}) {
for (auto bias_after_scale : {true, false}) {
for (auto s : {-1.0f, 0.13f}) {
for (auto b : {-15.f, 0.11234f}) {
#else
for (auto n : {1, 3, 4, 11}) {
for (auto c : {1, 3, 11, 4}) {
for (auto h : {3, 1, 11, 4}) {
......@@ -61,6 +70,8 @@ TEST(scale_arm, compute) {
for (auto bias_after_scale : {true, false}) {
for (auto s : {-100.25f, -1.0f, 0.13f, 3840.975f}) {
for (auto b : {-3075.495f, -15.f, 0.11234f, 128.15f}) {
#endif
x.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
output_ref.Resize(DDim(std::vector<int64_t>({n, c, h, w})));
......
......@@ -9,3 +9,4 @@ cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${tensor_lite})
nv_library(kernels_cuda DEPS mul_compute_cuda io_copy_compute_cuda cuda_blas_lite)
......@@ -13,3 +13,4 @@ set(host_kernels
)
set(host_kernels "${host_kernels}" CACHE GLOBAL "host kernels")
......@@ -35,3 +35,4 @@ set(x86_kernels
)
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
......@@ -29,9 +29,9 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
using param_t = operators::ActivationParam;
void Run() override {
auto &context = context_->As<X86Context>();
auto &context = ctx_->As<X86Context>();
auto &sgd_param = *param_.get_mutable<operators::SGDParam>();
CHECK(context.x86_device_context);
CHECK(context.x86_device_context());
// param.Out->template mutable_data<T>();
......@@ -45,12 +45,12 @@ class SGDCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
PADDLE_ENFORCE_EQ(grad->numel(), sz);
paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
const T *lr = learning_rate->template data<T>();
const T *param_data = param->template data<T>();
const T *grad_data = grad->template data<T>();
int64_t rows_idx = 0;
T *out_data =
param_out->mutable_data<T>(context.x86_device_context->GetPlace());
T *out_data = param_out->template mutable_data<T>(
context.x86_device_context()->GetPlace());
auto sgd =
paddle::operators::jit::KernelFuncs<paddle::operators::jit::SgdTuple<T>,
......
......@@ -27,3 +27,4 @@ lite_cc_test(test_op_desc_lite SRCS op_desc_test.cc DEPS cpp_op_desc_lite op_des
add_subdirectory(pb)
add_subdirectory(cpp)
cc_library(cpp_op_desc_lite SRCS op_desc.cc DEPS any_lite)
cc_library(var_desc_lite SRCS var_desc.cc DEPS framework_proto_lite)
cc_library(op_desc_lite SRCS op_desc.cc DEPS framework_proto_lite)
......@@ -8,6 +8,7 @@ cc_library(mul_op_lite SRCS mul_op.cc DEPS ${op_DEPS})
cc_library(scale_op_lite SRCS scale_op.cc DEPS ${op_DEPS})
cc_library(softmax_op_lite SRCS softmax_op.cc DEPS ${op_DEPS})
cc_library(reshape_op_lite SRCS reshape_op.cc DEPS ${op_DEPS} )
cc_library(batch_norm_op_lite SRCS batch_norm_op.cc DEPS ${op_DEPS})
cc_library(feed_op_lite SRCS feed_op.cc DEPS ${op_DEPS})
cc_library(fetch_op_lite SRCS fetch_op.cc DEPS ${op_DEPS})
cc_library(io_copy_op_lite SRCS io_copy_op.cc DEPS ${op_DEPS})
......@@ -30,6 +31,7 @@ set(ops_lite
scale_op_lite
softmax_op_lite
reshape_op_lite
batch_norm_op_lite
feed_op_lite
fetch_op_lite
io_copy_op_lite
......@@ -52,4 +54,6 @@ lite_cc_test(test_pool_op_lite SRCS pool_op_test.cc
lite_cc_test(test_scale_op_lite SRCS scale_op_test.cc DEPS scale_op_lite memory_lite)
lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite memory_lite)
lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite)
lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/batch_norm_op.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool BatchNormOp::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.bias);
CHECK_OR_FALSE(param_.scale);
CHECK_OR_FALSE(param_.mean);
CHECK_OR_FALSE(param_.variance);
CHECK_OR_FALSE(param_.y);
if (!param_.is_test) {
CHECK_OR_FALSE(param_.mean_out);
CHECK_OR_FALSE(param_.variance_out);
CHECK_OR_FALSE(param_.saved_mean);
CHECK_OR_FALSE(param_.saved_variance);
}
auto x_dims = param_.x->dims();
auto scale_dims = param_.scale->dims();
auto bias_dims = param_.bias->dims();
auto mean_dims = param_.mean->dims();
auto variance_dims = param_.variance->dims();
CHECK(x_dims.size() >= 2 && x_dims.size() <= 5)
<< "Input X must have 2 to 5 dimensions.";
CHECK_EQ(scale_dims.size(), 1UL) << "Input Scale must have 1 dimension.";
CHECK_EQ(bias_dims.size(), 1UL) << "Input Bias must have 1 dimension.";
CHECK_EQ(mean_dims.size(), 1UL) << "Input Mean must have 1 dimension.";
CHECK_EQ(variance_dims.size(), 1UL)
<< "Input Variance must have 1 dimension.";
return true;
}
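// Infer the channel size from the data layout, resize the running/saved
// statistics in training mode, and give Y the same shape as X.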
bool BatchNormOp::InferShape() const {
auto x_dims = param_.x->dims();
int64_t channel_size = 0;
switch (param_.data_layout) {
case DATALAYOUT(kNCHW):
channel_size = x_dims[1];
break;
// case DATALAYOUT(kNHWC):
// channel_size = x_dims[x_dims.size() - 1];
// break;
default:
LOG(FATAL) << "Unknown storage order: "
<< DataLayoutToStr(param_.data_layout);
break;
}
if (!param_.is_test) {
param_.mean_out->Resize({channel_size});
param_.variance_out->Resize({channel_size});
param_.saved_mean->Resize({channel_size});
param_.saved_variance->Resize({channel_size});
}
param_.y->Resize(x_dims);
return true;
}
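// Bind the input/output tensors from the scope and read the attributes from
// the op desc; the training-only outputs are attached only when is_test is false.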
bool BatchNormOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable<Tensor>();
param_.bias =
scope->FindVar(op_desc.Input("Bias").front())->GetMutable<Tensor>();
param_.scale =
scope->FindVar(op_desc.Input("Scale").front())->GetMutable<Tensor>();
param_.mean =
scope->FindVar(op_desc.Input("Mean").front())->GetMutable<Tensor>();
param_.variance =
scope->FindVar(op_desc.Input("Variance").front())->GetMutable<Tensor>();
param_.y = scope->FindVar(op_desc.Output("Y").front())->GetMutable<Tensor>();
param_.is_test = op_desc.GetAttr<bool>("is_test");
param_.use_global_stats = op_desc.GetAttr<bool>("use_global_stats");
if (!param_.is_test) {
param_.mean_out =
scope->FindVar(op_desc.Output("MeanOut").front())->GetMutable<Tensor>();
param_.variance_out = scope->FindVar(op_desc.Output("VarianceOut").front())
->GetMutable<Tensor>();
param_.saved_mean = scope->FindVar(op_desc.Output("SavedMean").front())
->GetMutable<Tensor>();
param_.saved_variance =
scope->FindVar(op_desc.Output("SavedVariance").front())
->GetMutable<Tensor>();
}
param_.epsilon = op_desc.GetAttr<float>("epsilon");
param_.momentum = op_desc.GetAttr<float>("momentum");
std::string data_layout = op_desc.GetAttr<std::string>("data_layout");
CHECK_EQ(data_layout, "NCHW") << "TODO(hong19860320): Only support NCHW.";
// param_.data_layout = StringToDataLayout(data_layout);
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(batch_norm, paddle::lite::operators::BatchNormOp);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/scope.h"
#include "paddle/fluid/lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class BatchNormOp : public OpLite {
public:
BatchNormOp() {}
explicit BatchNormOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "batch_norm"; }
private:
mutable BatchNormParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/operators/batch_norm_op.h"
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
TEST(batch_norm_op_lite, test) {
// prepare variables
Scope scope;
auto* x = scope.Var("x")->GetMutable<Tensor>();
auto* scale = scope.Var("scale")->GetMutable<Tensor>();
auto* bias = scope.Var("bias")->GetMutable<Tensor>();
auto* mean = scope.Var("mean")->GetMutable<Tensor>();
auto* variance = scope.Var("variance")->GetMutable<Tensor>();
auto* y = scope.Var("y")->GetMutable<Tensor>();
x->Resize({2, 32, 10, 20});
auto x_dims = x->dims();
const int64_t channel_size = x_dims[1]; // NCHW
scale->Resize({channel_size});
bias->Resize({channel_size});
mean->Resize({channel_size});
variance->Resize({channel_size});
// prepare op desc
cpp::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"x"});
desc.SetInput("Scale", {"scale"});
desc.SetInput("Bias", {"bias"});
desc.SetInput("Mean", {"mean"});
desc.SetInput("Variance", {"variance"});
desc.SetOutput("Y", {"y"});
desc.SetAttr("is_test", true);
desc.SetAttr("use_global_stats", false);
desc.SetAttr("epsilon", 1e-5f);
desc.SetAttr("momentum", 0.9f);
desc.SetAttr("data_layout", std::string("NCHW"));
BatchNormOp batch_norm("batch_norm");
batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
batch_norm.Attach(desc, &scope);
batch_norm.CheckShape();
batch_norm.InferShape();
// check output dims
auto y_dims = y->dims();
CHECK_EQ(y_dims.size(), x_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
CHECK_EQ(y_dims[i], x_dims[i]);
}
}
TEST(batch_norm_op_lite, test_enable_is_test) {
// prepare variables
Scope scope;
auto* x = scope.Var("x")->GetMutable<Tensor>();
auto* scale = scope.Var("scale")->GetMutable<Tensor>();
auto* bias = scope.Var("bias")->GetMutable<Tensor>();
auto* mean = scope.Var("mean")->GetMutable<Tensor>();
auto* variance = scope.Var("variance")->GetMutable<Tensor>();
auto* y = scope.Var("y")->GetMutable<Tensor>();
auto* mean_out = scope.Var("mean_out")->GetMutable<Tensor>();
auto* variance_out = scope.Var("variance_out")->GetMutable<Tensor>();
auto* saved_mean = scope.Var("saved_mean")->GetMutable<Tensor>();
auto* saved_variance = scope.Var("saved_variance")->GetMutable<Tensor>();
x->Resize({2, 32, 10, 20});
auto x_dims = x->dims();
const int64_t channel_size = x_dims[1]; // NCHW
scale->Resize({channel_size});
bias->Resize({channel_size});
mean->Resize({channel_size});
variance->Resize({channel_size});
// prepare op desc
cpp::OpDesc desc;
desc.SetType("batch_norm");
desc.SetInput("X", {"x"});
desc.SetInput("Scale", {"scale"});
desc.SetInput("Bias", {"bias"});
desc.SetInput("Mean", {"mean"});
desc.SetInput("Variance", {"variance"});
desc.SetOutput("Y", {"y"});
desc.SetOutput("MeanOut", {"mean_out"});
desc.SetOutput("VarianceOut", {"variance_out"});
desc.SetOutput("SavedMean", {"saved_mean"});
desc.SetOutput("SavedVariance", {"saved_variance"});
desc.SetAttr("is_test", false);
desc.SetAttr("use_global_stats", false);
desc.SetAttr("epsilon", 1e-5f);
desc.SetAttr("momentum", 0.9f);
desc.SetAttr("data_layout", std::string("NCHW"));
BatchNormOp batch_norm("batch_norm");
batch_norm.SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}});
batch_norm.Attach(desc, &scope);
batch_norm.CheckShape();
batch_norm.InferShape();
// check output dims
auto y_dims = y->dims();
CHECK_EQ(y_dims.size(), x_dims.size());
for (size_t i = 0; i < y_dims.size(); i++) {
CHECK_EQ(y_dims[i], x_dims[i]);
}
auto mean_out_dims = mean_out->dims();
auto variance_out_dims = variance_out->dims();
auto saved_mean_dims = saved_mean->dims();
auto saved_variance_dims = saved_variance->dims();
CHECK_EQ(mean_out_dims.size(), 1UL);
CHECK_EQ(variance_out_dims.size(), 1UL);
CHECK_EQ(saved_mean_dims.size(), 1UL);
CHECK_EQ(saved_variance_dims.size(), 1UL);
CHECK_EQ(mean_out_dims[0], channel_size);
CHECK_EQ(variance_out_dims[0], channel_size);
CHECK_EQ(saved_mean_dims[0], channel_size);
CHECK_EQ(saved_variance_dims[0], channel_size);
}
} // namespace operators
} // namespace lite
} // namespace paddle
......@@ -57,6 +57,7 @@ struct FcParam {
lite::Tensor* output{};
lite::DDim in_mat_dims;
int in_num_col_dims{1};
bool weight_transposed{false};
};
struct ReluParam {
......@@ -145,6 +146,25 @@ struct ConvParam {
std::string data_format{"Anylayout"};
};
// For BatchNorm op
struct BatchNormParam {
lite::Tensor* x{};
lite::Tensor* bias{};
lite::Tensor* scale{};
lite::Tensor* mean{};
lite::Tensor* variance{};
lite::Tensor* y{};
lite::Tensor* mean_out{};
lite::Tensor* variance_out{};
lite::Tensor* saved_mean{};
lite::Tensor* saved_variance{};
bool is_test{true};
bool use_global_stats{false};
float epsilon;
float momentum;
DataLayoutType data_layout{DATALAYOUT(kNCHW)};
};
// For Pooling op
struct PoolParam {
lite::Tensor* x{};
......
......@@ -74,7 +74,11 @@ TEST(pool_op_lite, test) {
pool.Attach(desc, &scope);
auto kernels = pool.CreateKernels({Place{TARGET(kARM), PRECISION(kFloat)}});
LOG(INFO) << "kernels.size(): " << kernels.size();
#ifdef LITE_WITH_ARM
ASSERT_FALSE(kernels.empty());
#else
ASSERT_TRUE(kernels.empty());
#endif
}
} // namespace operators
......
......@@ -37,7 +37,7 @@ bool SplitOp::InferShape() const {
const auto &sections = param_.sections;
const int outs_number = outs.size();
std::vector<lite::DDimLite> outs_dims;
std::vector<lite::DDimHvy> outs_dims;
outs_dims.reserve(outs_number);
if (num > 0) {
......
......@@ -13,6 +13,11 @@ function prepare_for_codegen {
mkdir -p ./paddle/fluid/lite/gen_code
touch ./paddle/fluid/lite/gen_code/__generated_code__.cc
}
function check_need_ci {
git log -1 --oneline | grep "test=develop" || exit -1
}
function cmake_x86 {
prepare_for_codegen
cmake .. -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON ${common_flags}
......@@ -28,6 +33,17 @@ function cmake_gpu {
cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON"
}
function check_style {
export PATH=/usr/bin:$PATH
#pre-commit install
clang-format --version
if ! pre-commit run -a ; then
git diff
exit 1
fi
}
function cmake_arm {
# $1: ARM_TARGET_OS in "android" , "armlinux"
# $2: ARM_TARGET_ARCH_ABI in "arm64-v8a", "armeabi-v7a" ,"armeabi-v7a-hf"
......@@ -43,10 +59,15 @@ function cmake_arm {
-DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
}
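# Build a single make target with a fixed -j8 (the nproc-based invocation is left commented out).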
function build_single {
#make $1 -j$(expr $(nproc) - 2)
make $1 -j8
}
function build {
file=$1
for _test in $(cat $file); do
make $_test -j$(expr $(nproc) - 2)
build_single $_test
done
}
......@@ -58,44 +79,12 @@ function test_lite {
for _test in $(cat $file); do
# We move the build phase here to make the 'gen_code' test compiles after the
# corresponding test is executed and the C++ code generates.
make $_test -j$(expr $(nproc) - 2)
#make $_test -j$(expr $(nproc) - 2)
make $_test -j8
ctest -R $_test -V
done
}
port_armv8=5554
port_armv7=5556
# Run test on android
function test_lite_android {
local file=$1
local adb_abi=$2
local port=
if [[ ${adb_abi} == "armeabi-v7a" ]]; then
port=${port_armv7}
fi
if [[ ${adb_abi} == "arm64-v8a" ]]; then
port=${port_armv8}
fi
if [[ "${port}x" == "x" ]]; then
echo "Port can not be empty"
exit 1
fi
echo "file: ${file}"
# push all to adb and test
adb_work_dir="/data/local/tmp"
skip_list="test_model_parser_lite"
for _test in $(cat $file); do
[[ $skip_list =~ (^|[[:space:]])$_test($|[[:space:]]) ]] && continue || echo 'skip $_test'
testpath=$(find ./paddle/fluid -name ${_test})
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${_test}"
adb -s emulator-${port} shell "./${adb_work_dir}/${_test}"
done
}
# Build the code and run lite server tests. This is executed in the CI system.
function build_test_server {
mkdir -p ./build
......@@ -108,8 +97,34 @@ function build_test_server {
build $LIBS_FILE
}
# Build the code and run lite server tests. This is executed in the CI system.
# test_arm_android <some_test_name> <adb_port_number>
function test_arm_android {
test_name=$1
port=$2
if [[ "${test_name}x" == "x" ]]; then
echo "test_name can not be empty"
exit 1
fi
if [[ "${port}x" == "x" ]]; then
echo "Port can not be empty"
exit 1
fi
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list="test_model_parser_lite" # add more with space
[[ $skip_list =~ (^|[[:space:]])${test_name}($|[[:space:]]) ]] && echo "skip ${test_name}" && return
testpath=$(find ./paddle/fluid -name ${test_name})
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
}
# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm {
port_armv8=5554
port_armv7=5556
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
# start android arm64-v8a armeabi-v7a emulators first
......@@ -122,6 +137,7 @@ function build_test_arm {
for os in "android" "armlinux" ; do
for abi in "arm64-v8a" "armeabi-v7a" "armeabi-v7a-hf" ; do
# TODO(TJ): enable compile on v7-hf on andorid and all v7 on armlinux
if [[ ${abi} == "armeabi-v7a-hf" ]]; then
echo "armeabi-v7a-hf is not supported on both android and armlinux"
continue
......@@ -138,17 +154,30 @@ function build_test_arm {
cmake_arm ${os} ${abi}
build $TESTS_FILE
# armlinux need in another docker
# TODO(TJ): enable test with armlinux
if [[ ${os} == "android" ]]; then
adb_abi=${abi}
if [[ ${adb_abi} == "armeabi-v7a-hf" ]]; then
adb_abi="armeabi-v7a"
fi
if [[ ${adb_abi} == "armeabi-v7a" ]]; then
# skip v7 tests
# skip all armv7 tests
# TODO(TJ): enable test with armv7
continue
fi
test_lite_android $TESTS_FILE ${adb_abi}
# armlinux need in another docker
local port=
if [[ ${adb_abi} == "armeabi-v7a" ]]; then
port=${port_armv7}
fi
if [[ ${adb_abi} == "arm64-v8a" ]]; then
port=${port_armv8}
fi
echo "test file: ${TESTS_FILE}"
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
done
fi
cd -
done
......@@ -164,12 +193,13 @@ function print_usage {
echo "----------------------------------------"
echo -e "cmake_x86: run cmake with X86 mode"
echo -e "cmake_cuda: run cmake with CUDA mode"
echo -e "cmake_arm: run cmake with ARM mode"
echo -e "--arm_os=<os> --arm_abi=<abi> cmake_arm: run cmake with ARM mode"
echo
echo -e "build: compile the tests"
echo -e "--test_name=<test_name> build_single: compile single test"
echo
echo -e "test_server: run server tests"
echo -e "test_mobile: run mobile tests"
echo -e "--test_name=<test_name> --adb_port_number=<adb_port_number> test_arm_android: run arm test"
echo "----------------------------------------"
echo
}
......@@ -182,11 +212,31 @@ function main {
TESTS_FILE="${i#*=}"
shift
;;
--test_name=*)
TEST_NAME="${i#*=}"
shift
;;
--arm_os=*)
ARM_OS="${i#*=}"
shift
;;
--arm_abi=*)
ARM_ABI="${i#*=}"
shift
;;
--arm_port=*)
ARM_PORT="${i#*=}"
shift
;;
build)
build $TESTS_FILE
build $LIBS_FILE
shift
;;
build_single)
build_single $TEST_NAME
shift
;;
cmake_x86)
cmake_x86
shift
......@@ -196,15 +246,15 @@ function main {
shift
;;
cmake_arm)
cmake_arm $2 $3
cmake_arm $ARM_OS $ARM_ABI
shift
;;
test_server)
test_lite $TESTS_FILE
shift
;;
test_mobile)
test_lite $TESTS_FILE
test_arm_android)
test_arm_android $TEST_NAME $ARM_PORT
shift
;;
build_test_server)
......@@ -215,6 +265,14 @@ function main {
build_test_arm
shift
;;
check_style)
check_style
shift
;;
check_need_ci)
check_need_ci
shift
;;
*)
# unknown option
print_usage
......@@ -224,7 +282,5 @@ function main {
done
}
print_usage
main $@
......@@ -9,3 +9,4 @@ set(utils_DEPS glog)
lite_cc_test(test_varient SRCS varient_test.cc DEPS utils_lite)
cc_library(any_lite SRCS any.cc)
cc_library(utils_lite SRCS cp_logging.cc string.cc DEPS ${utils_DEPS} any_lite)
......@@ -4,3 +4,4 @@ endif()
cc_library(target_wrapper_x86 SRCS target_wrapper.cc)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig
from paddle.proto.ModelConfig_pb2 import ModelConfig