提交 dfa731e1 编写于 作者: L liuruilong

format files

上级 3b82bfb5
......@@ -45,34 +45,35 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
// Tensor inv_std;
// auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
PADDLE_MOBILE_ENFORCE(C == variance->numel(), "C must equal to variance.numel()");
PADDLE_MOBILE_ENFORCE(C == variance->numel(),
"C must equal to variance.numel()");
int HXW = H * W;
if (HXW > 32) {
int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float * volatile new_scale_ptr = new float[NXC * 4];
float * volatile new_bias_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i/4] + epsilon), 0.5));
1 / static_cast<float>(pow((variance_ptr[i / 4] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i/4];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i / 4];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i/4] - mean_ptr[i/4] * inv_std_ptr[i] * scale_ptr[i/4];
new_bias_ptr[i] =
bias_ptr[i / 4] - mean_ptr[i / 4] * inv_std_ptr[i] * scale_ptr[i / 4];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
......@@ -84,7 +85,6 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
......@@ -180,14 +180,15 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
:[input_x_ptr]"r"(input_x_ptr), [out_ptr]"r"(out_ptr), [new_scale_ptr]"r"(new_scale_ptr), [new_bias_ptr]"r"(new_bias_ptr),
[N]"r"(N), [C]"r"(C), [HXW]"r"(HXW)
:"memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "r5", "r6"
);
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete [] inv_std_ptr;
delete [] new_scale_ptr;
delete [] new_bias_ptr;
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
......@@ -205,7 +206,8 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
......@@ -220,16 +222,15 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
}
}
delete [] inv_std_ptr;
// DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
// DLOG << "input_x_ptr : " << input_x_ptr[102];
// DLOG << "variance : " << variance_ptr[5];
// DLOG << "inv_std_ptr : " << inv_std_ptr[5];
// DLOG << "new_scale_ptr : " << new_scale_ptr[5];
// DLOG << "new_bias_ptr : " << new_bias_ptr[5];
// DLOG << "out_ptr : " << out_ptr[102];
delete[] inv_std_ptr;
// DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
// DLOG << "input_x_ptr : " << input_x_ptr[102];
// DLOG << "variance : " << variance_ptr[5];
// DLOG << "inv_std_ptr : " << inv_std_ptr[5];
// DLOG << "new_scale_ptr : " << new_scale_ptr[5];
// DLOG << "new_bias_ptr : " << new_bias_ptr[5];
// DLOG << "out_ptr : " << out_ptr[102];
}
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -38,70 +38,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel();
// if (numel > 64) {
// asm volatile(
// "pld [%[input_x_ptr], #0] \n\t"
// "vmov.f32 q8, #0.0 \n\t"
// "subs %[num], %[num], #32 \n\t"
// "blt end_num_%= \n\t"
// "loop_num_%=: \n\t"
// "pld [%[input_x_ptr], #1024] \n\t"
//
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
//
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
//
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
//
// "subs %[num], %[num], #32 \n\t"
// "bge loop_num_%= \n\t"
// "end_num_%=: \n\t"
// "cmp %[num], #0 \n\t"
// "bge end_%= \n\t"
// "mov r6, #4 \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// :
// :
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel)
// : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5",
// "r6");
// } else {
// if (numel > 64) {
// asm volatile(
// "pld [%[input_x_ptr], #0] \n\t"
// "vmov.f32 q8, #0.0 \n\t"
// "subs %[num], %[num], #32 \n\t"
// "blt end_num_%= \n\t"
// "loop_num_%=: \n\t"
// "pld [%[input_x_ptr], #1024] \n\t"
//
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
//
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
//
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
//
// "subs %[num], %[num], #32 \n\t"
// "bge loop_num_%= \n\t"
// "end_num_%=: \n\t"
// "cmp %[num], #0 \n\t"
// "bge end_%= \n\t"
// "mov r6, #4 \n\t"
// "mul r5, %[num], r6 \n\t"
// "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
// "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
// "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
// "vmax.f32 q0, q0, q8 \n\t"
// "vmax.f32 q1, q1, q8 \n\t"
// "vmax.f32 q2, q2, q8 \n\t"
// "vmax.f32 q3, q3, q8 \n\t"
// "vmax.f32 q4, q4, q8 \n\t"
// "vmax.f32 q5, q5, q8 \n\t"
// "vmax.f32 q6, q6, q8 \n\t"
// "vmax.f32 q7, q7, q8 \n\t"
// "add %[out_ptr], %[out_ptr], r5 \n\t"
// "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
// "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
// "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
// "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
// "end_%=: \n\t"
// :
// :
// [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
// "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
// "q7", "q8", "r5",
// "r6");
// } else {
ReluFunctor<float> func_;
math::Transform trans;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// }
// }
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -19,9 +19,9 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
#define PADDLE_MOBILE_TEST_LIB_SIZE_H
#include <vector>
#include <pthread.h>
#include <thread>
#include <vector>
//#include <list>
//#include <tuple>
//#include <typeinfo>
......@@ -74,7 +74,7 @@ void foo() {
// int z = 10;
// }
// std::shared_ptr<int> s1 = std::make_shared<int>();
// std::shared_ptr<int> s1 = std::make_shared<int>();
// std::stringstream ss;
// ss << "12345";
......
......@@ -137,7 +137,8 @@ int main() {
auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::Tensor mean;
SetupTensor<float>(&mean, {256}, static_cast<float>(0), static_cast<float>(1));
SetupTensor<float>(&mean, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *mean_ptr = mean.data<float>();
paddle_mobile::framework::Tensor scale;
......@@ -151,7 +152,8 @@ int main() {
auto *variance_ptr = variance.data<float>();
paddle_mobile::framework::Tensor bias;
SetupTensor<float>(&bias, {256}, static_cast<float>(0), static_cast<float>(1));
SetupTensor<float>(&bias, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *bias_ptr = bias.data<float>();
paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册