Commit ad143ba5 authored by liuruilong

format files

Parent aec0327a
@@ -45,34 +45,35 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
  auto scale_ptr = scale->data<float>();
  auto bias_ptr = bias->data<float>();
  //  Tensor inv_std;
  //  auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
  PADDLE_MOBILE_ENFORCE(C == variance->numel(),
                        "C must equal to variance.numel()");
  int HXW = H * W;
  if (HXW > 32) {
    int NXC = N * C;
    float *inv_std_ptr = new float[NXC * 4];
    float *volatile new_scale_ptr = new float[NXC * 4];
    float *volatile new_bias_ptr = new float[NXC * 4];
    ///  std = (var + epsilon).sqrt();
    ///  inv_std = 1 / std;
    for (int i = 0; i < C * 4; i += 4) {
      inv_std_ptr[i] =
          1 / static_cast<float>(pow((variance_ptr[i / 4] + epsilon), 0.5));
      inv_std_ptr[i + 1] = inv_std_ptr[i];
      inv_std_ptr[i + 2] = inv_std_ptr[i];
      inv_std_ptr[i + 3] = inv_std_ptr[i];
      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i / 4];
      new_scale_ptr[i + 1] = new_scale_ptr[i];
      new_scale_ptr[i + 2] = new_scale_ptr[i];
      new_scale_ptr[i + 3] = new_scale_ptr[i];
      new_bias_ptr[i] =
          bias_ptr[i / 4] - mean_ptr[i / 4] * inv_std_ptr[i] * scale_ptr[i / 4];
      new_bias_ptr[i + 1] = new_bias_ptr[i];
      new_bias_ptr[i + 2] = new_bias_ptr[i];
@@ -84,7 +85,6 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
      new_bias_ptr[j] = new_bias_ptr[j - C * 4];
    }
    asm volatile(
        "subs %[N], %[N], #1          \n\t"
        "blt  end_n_%=                \n\t"
@@ -180,14 +180,15 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
        "bge  loop_n_%=               \n\t"
        "end_n_%=:                    \n\t"
        :
        : [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
          [new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
          [N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
        : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
          "q10", "r5", "r6");
    delete[] inv_std_ptr;
    delete[] new_scale_ptr;
    delete[] new_bias_ptr;
  } else {
    float *inv_std_ptr = new float[C];
@@ -205,7 +206,8 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
    ///  (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    for (int i = 0; i < C; i++) {
      new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
      new_bias_ptr[i] =
          bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
      {
        for (int n = 0; n < N; n++) {
          for (int h = 0; h < H; h++) {
@@ -220,16 +222,15 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
      }
    }
    delete[] inv_std_ptr;
    //  DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
    //  DLOG << "input_x_ptr : " << input_x_ptr[102];
    //  DLOG << "variance : " << variance_ptr[5];
    //  DLOG << "inv_std_ptr : " << inv_std_ptr[5];
    //  DLOG << "new_scale_ptr : " << new_scale_ptr[5];
    //  DLOG << "new_bias_ptr : " << new_bias_ptr[5];
    //  DLOG << "out_ptr : " << out_ptr[102];
  }
}
}  // namespace operators
}  // namespace paddle_mobile
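
For reference, the /// comments above describe the inference-time fold this kernel performs: inv_std = 1 / sqrt(var + epsilon), new_scale = inv_std * scale, new_bias = bias - mean * inv_std * scale, so each output element needs only one multiply-add. The quadruplicated tables (entries i, i+1, i+2, i+3 all equal) exist so the NEON path can load one q register of per-channel constants at a time. Below is a minimal scalar sketch of the same computation, assuming NCHW layout; `BatchNormRef` and its signature are illustrative only, not part of this commit.

```cpp
#include <cmath>
#include <vector>

// Scalar sketch (hypothetical helper, not from this commit) of the
// batch-norm fold the kernel above precomputes: fold variance/scale into
// new_scale and mean/bias into new_bias, then apply one multiply-add per
// element. Assumes NCHW layout.
void BatchNormRef(const float *x, float *y, const float *mean,
                  const float *variance, const float *scale,
                  const float *bias, float epsilon, int N, int C, int H,
                  int W) {
  std::vector<float> new_scale(C), new_bias(C);
  for (int c = 0; c < C; ++c) {
    float inv_std = 1.0f / std::sqrt(variance[c] + epsilon);
    new_scale[c] = inv_std * scale[c];
    new_bias[c] = bias[c] - mean[c] * inv_std * scale[c];
  }
  const int HXW = H * W;
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      const float *in = x + (n * C + c) * HXW;
      float *out = y + (n * C + c) * HXW;
      for (int i = 0; i < HXW; ++i) {
        // y = (x - mean) / std * scale + bias, folded into one FMA.
        out[i] = in[i] * new_scale[c] + new_bias[c];
      }
    }
  }
}
```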
......
@@ -38,70 +38,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
  auto *out_ptr = out->mutable_data<float>();
  int numel = input_x->numel();
  //  if (numel > 64) {
  //    asm volatile(
  //        "pld [%[input_x_ptr], #0]               \n\t"
  //        "vmov.f32 q8, #0.0                      \n\t"
  //        "subs %[num], %[num], #32               \n\t"
  //        "blt end_num_%=                         \n\t"
  //        "loop_num_%=:                           \n\t"
  //        "pld [%[input_x_ptr], #1024]            \n\t"
  //
  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
  //
  //        "vmax.f32 q0, q0, q8                    \n\t"
  //        "vmax.f32 q1, q1, q8                    \n\t"
  //        "vmax.f32 q2, q2, q8                    \n\t"
  //        "vmax.f32 q3, q3, q8                    \n\t"
  //        "vmax.f32 q4, q4, q8                    \n\t"
  //        "vmax.f32 q5, q5, q8                    \n\t"
  //        "vmax.f32 q6, q6, q8                    \n\t"
  //        "vmax.f32 q7, q7, q8                    \n\t"
  //
  //        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
  //        "vst1.32 {q2, q3}, [%[out_ptr]]!        \n\t"
  //        "vst1.32 {q4, q5}, [%[out_ptr]]!        \n\t"
  //        "vst1.32 {q6, q7}, [%[out_ptr]]!        \n\t"
  //
  //        "subs %[num], %[num], #32               \n\t"
  //        "bge loop_num_%=                        \n\t"
  //        "end_num_%=:                            \n\t"
  //        "cmp %[num], #0                         \n\t"
  //        "bge end_%=                             \n\t"
  //        "mov r6, #4                             \n\t"
  //        "mul r5, %[num], r6                     \n\t"
  //        "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
  //        "vld1.32 {q0, q1}, [%[input_x_ptr]]!    \n\t"
  //        "vld1.32 {q2, q3}, [%[input_x_ptr]]!    \n\t"
  //        "vld1.32 {q4, q5}, [%[input_x_ptr]]!    \n\t"
  //        "vld1.32 {q6, q7}, [%[input_x_ptr]]!    \n\t"
  //        "vmax.f32 q0, q0, q8                    \n\t"
  //        "vmax.f32 q1, q1, q8                    \n\t"
  //        "vmax.f32 q2, q2, q8                    \n\t"
  //        "vmax.f32 q3, q3, q8                    \n\t"
  //        "vmax.f32 q4, q4, q8                    \n\t"
  //        "vmax.f32 q5, q5, q8                    \n\t"
  //        "vmax.f32 q6, q6, q8                    \n\t"
  //        "vmax.f32 q7, q7, q8                    \n\t"
  //        "add %[out_ptr], %[out_ptr], r5         \n\t"
  //        "vst1.32 {q0, q1}, [%[out_ptr]]!        \n\t"
  //        "vst1.32 {q2, q3}, [%[out_ptr]]!        \n\t"
  //        "vst1.32 {q4, q5}, [%[out_ptr]]!        \n\t"
  //        "vst1.32 {q6, q7}, [%[out_ptr]]!        \n\t"
  //        "end_%=:                                \n\t"
  //        :
  //        :
  //        [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
  //        "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
  //        "q7", "q8", "r5",
  //        "r6");
  //  } else {
  ReluFunctor<float> func_;
  math::Transform trans;
  trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
  //  }
}
}  // namespace operators
}  // namespace paddle_mobile
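
A note on the commented-out NEON path above: it clamps 32 floats per iteration (vmax.f32 against q8 = 0) and handles the tail by stepping both pointers back by the shortfall (r5 = %[num] * 4 bytes, with %[num] negative after the loop) and re-running one final 32-float window, so a few trailing elements are recomputed harmlessly. The active path is ReluFunctor applied via math::Transform; a plain scalar equivalent, as an illustrative sketch only (`ReluRef` is a hypothetical name, not part of this commit), is:

```cpp
#include <algorithm>

// Illustrative scalar reference for the ReLU kernel above:
// out[i] = max(in[i], 0.0f) for all numel elements, which is what both
// the ReluFunctor path and the commented-out NEON path compute.
void ReluRef(const float *in, float *out, int numel) {
  for (int i = 0; i < numel; ++i) {
    out[i] = std::max(in[i], 0.0f);
  }
}
```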
......
@@ -19,9 +19,9 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
#define PADDLE_MOBILE_TEST_LIB_SIZE_H

#include <pthread.h>
#include <thread>
#include <vector>
//#include <list>
//#include <tuple>
//#include <typeinfo>
@@ -74,7 +74,7 @@ void foo() {
  //  int z = 10;
  //  }
  //  std::shared_ptr<int> s1 = std::make_shared<int>();
  //  std::stringstream ss;
  //  ss << "12345";
......
@@ -137,7 +137,8 @@ int main() {
  auto *inputx1_ptr = inputx1.data<float>();
  paddle_mobile::framework::Tensor mean;
  SetupTensor<float>(&mean, {256}, static_cast<float>(0),
                     static_cast<float>(1));
  auto *mean_ptr = mean.data<float>();
  paddle_mobile::framework::Tensor scale;
@@ -151,7 +152,8 @@ int main() {
  auto *variance_ptr = variance.data<float>();
  paddle_mobile::framework::Tensor bias;
  SetupTensor<float>(&bias, {256}, static_cast<float>(0),
                     static_cast<float>(1));
  auto *bias_ptr = bias.data<float>();
  paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
......