提交 98c24151 编写于 作者: R Ruilong Liu 提交者: GitHub

Merge pull request #423 from codeWorm2015/develop

fix #422 optimize batch norm 
...@@ -23,7 +23,6 @@ namespace operators { ...@@ -23,7 +23,6 @@ namespace operators {
template <> template <>
void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const { void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
/// todo: test.
const Tensor *input_x = param.InputX(); const Tensor *input_x = param.InputX();
auto input_x_ptr = input_x->data<float>(); auto input_x_ptr = input_x->data<float>();
const auto &x_dims = input_x->dims(); const auto &x_dims = input_x->dims();
...@@ -46,50 +45,192 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const { ...@@ -46,50 +45,192 @@ void BatchNormKernel<CPU, float>::Compute(const BatchNormParam &param) const {
auto scale_ptr = scale->data<float>(); auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>(); auto bias_ptr = bias->data<float>();
Tensor inv_std; // Tensor inv_std;
auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C})); // auto inv_std_ptr = inv_std.mutable_data<float>(make_ddim({C}));
if (C != variance->numel()) {
DLOG << "C must equal to variance.numel()";
}
assert(C == variance->numel());
/// std = (var + epsilon).sqrt(); PADDLE_MOBILE_ENFORCE(C == variance->numel(),
/// inv_std = 1 / std; "C must equal to variance.numel()");
for (int i = 0; i < C; i++) {
inv_std_ptr[i] = int HXW = H * W;
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5)); if (HXW > 32) {
} int NXC = N * C;
float *inv_std_ptr = new float[NXC * 4];
float *volatile new_scale_ptr = new float[NXC * 4];
float *volatile new_bias_ptr = new float[NXC * 4];
/// std = (var + epsilon).sqrt();
/// inv_std = 1 / std;
for (int i = 0; i < C * 4; i += 4) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i / 4] + epsilon), 0.5));
inv_std_ptr[i + 1] = inv_std_ptr[i];
inv_std_ptr[i + 2] = inv_std_ptr[i];
inv_std_ptr[i + 3] = inv_std_ptr[i];
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i / 4];
new_scale_ptr[i + 1] = new_scale_ptr[i];
new_scale_ptr[i + 2] = new_scale_ptr[i];
new_scale_ptr[i + 3] = new_scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[i / 4] - mean_ptr[i / 4] * inv_std_ptr[i] * scale_ptr[i / 4];
new_bias_ptr[i + 1] = new_bias_ptr[i];
new_bias_ptr[i + 2] = new_bias_ptr[i];
new_bias_ptr[i + 3] = new_bias_ptr[i];
}
for (int j = C * 4; j < NXC * 4; ++j) {
new_scale_ptr[j] = new_scale_ptr[j - C * 4];
new_bias_ptr[j] = new_bias_ptr[j - C * 4];
}
asm volatile(
"subs %[N], %[N], #1 \n\t"
"blt end_n_%= \n\t"
"loop_n_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"blt end_c_%= \n\t"
"loop_c_%=: \n\t"
"vld1.32 {q9}, [%[new_scale_ptr]]! \n\t"
"vld1.32 {q10}, [%[new_bias_ptr]]! \n\t"
"mov r6, %[HXW] \n\t"
"subs r6, r6, #32 \n\t"
"blt end_hw_%= \n\t"
"loop_hw_%=: \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
Tensor new_scale; "vmul.f32 q5, q5, q9 \n\t"
auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C})); "vmul.f32 q6, q6, q9 \n\t"
Tensor new_bias; "vmul.f32 q7, q7, q9 \n\t"
auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C})); "vmul.f32 q8, q8, q9 \n\t"
/// ((x - est_mean) * (inv_var) * scale + bias equal to "vadd.f32 q1, q1, q10 \n\t"
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale) "vadd.f32 q2, q2, q10 \n\t"
for (int i = 0; i < C; i++) { "vadd.f32 q3, q3, q10 \n\t"
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i]; "vadd.f32 q4, q4, q10 \n\t"
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i]; "vadd.f32 q5, q5, q10 \n\t"
{ "vadd.f32 q6, q6, q10 \n\t"
for (int n = 0; n < N; n++) { "vadd.f32 q7, q7, q10 \n\t"
for (int h = 0; h < H; h++) { "vadd.f32 q8, q8, q10 \n\t"
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) { "vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
int index = tmp_index + w; "vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
out_ptr[index] = "vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i]; "vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"subs r6, r6, #32 \n\t"
"bge loop_hw_%= \n\t"
"end_hw_%=: \n\t"
"cmp r6, #0 \n\t"
"bge end_remainder_%= \n\t"
"mov r5, #4 \n\t"
"mul r6, r6, r5 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r6 \n\t"
"vld1.32 {q1, q2}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q3, q4}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q5, q6}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q7, q8}, [%[input_x_ptr]]! \n\t"
"vmul.f32 q1, q1, q9 \n\t"
"vmul.f32 q2, q2, q9 \n\t"
"vmul.f32 q3, q3, q9 \n\t"
"vmul.f32 q4, q4, q9 \n\t"
"vmul.f32 q5, q5, q9 \n\t"
"vmul.f32 q6, q6, q9 \n\t"
"vmul.f32 q7, q7, q9 \n\t"
"vmul.f32 q8, q8, q9 \n\t"
"vadd.f32 q1, q1, q10 \n\t"
"vadd.f32 q2, q2, q10 \n\t"
"vadd.f32 q3, q3, q10 \n\t"
"vadd.f32 q4, q4, q10 \n\t"
"vadd.f32 q5, q5, q10 \n\t"
"vadd.f32 q6, q6, q10 \n\t"
"vadd.f32 q7, q7, q10 \n\t"
"vadd.f32 q8, q8, q10 \n\t"
"add %[out_ptr], %[out_ptr], r6 \n\t"
"vst1.32 {q1, q2}, [%[out_ptr]]! \n\t"
"vst1.32 {q3, q4}, [%[out_ptr]]! \n\t"
"vst1.32 {q5, q6}, [%[out_ptr]]! \n\t"
"vst1.32 {q7, q8}, [%[out_ptr]]! \n\t"
"end_remainder_%=: \n\t"
"subs %[C], %[C], #1 \n\t"
"bge loop_c_%= \n\t"
"end_c_%=: \n\t"
"subs %[N], %[N], #1 \n\t"
"bge loop_n_%= \n\t"
"end_n_%=: \n\t"
:
: [input_x_ptr] "r"(input_x_ptr), [out_ptr] "r"(out_ptr),
[new_scale_ptr] "r"(new_scale_ptr), [new_bias_ptr] "r"(new_bias_ptr),
[N] "r"(N), [C] "r"(C), [HXW] "r"(HXW)
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "r5", "r6");
delete[] inv_std_ptr;
delete[] new_scale_ptr;
delete[] new_bias_ptr;
} else {
float *inv_std_ptr = new float[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
Tensor new_scale;
auto new_scale_ptr = new_scale.mutable_data<float>(make_ddim({C}));
Tensor new_bias;
auto new_bias_ptr = new_bias.mutable_data<float>(make_ddim({C}));
/// ((x - est_mean) * (inv_var) * scale + bias equal to
/// (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] =
bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
{
for (int n = 0; n < N; n++) {
for (int h = 0; h < H; h++) {
int tmp_index = n * stride0 + i * stride1 + h * stride2;
for (int w = 0; w < W; w++) {
int index = tmp_index + w;
out_ptr[index] =
input_x_ptr[index] * new_scale_ptr[i] + new_bias_ptr[i];
}
} }
} }
} }
} }
delete[] inv_std_ptr;
// DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
// DLOG << "input_x_ptr : " << input_x_ptr[102];
// DLOG << "variance : " << variance_ptr[5];
// DLOG << "inv_std_ptr : " << inv_std_ptr[5];
// DLOG << "new_scale_ptr : " << new_scale_ptr[5];
// DLOG << "new_bias_ptr : " << new_bias_ptr[5];
// DLOG << "out_ptr : " << out_ptr[102];
} }
DLOG << "input[2,5,1,0](input[102]) ,channel 5 :";
DLOG << "input_x_ptr : " << input_x_ptr[102];
DLOG << "variance : " << variance_ptr[5];
DLOG << "inv_std_ptr : " << inv_std_ptr[5];
DLOG << "new_scale_ptr : " << new_scale_ptr[5];
DLOG << "new_bias_ptr : " << new_bias_ptr[5];
DLOG << "out_ptr : " << out_ptr[102];
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -38,70 +38,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const { ...@@ -38,70 +38,71 @@ void ReluKernel<CPU, float>::Compute(const ReluParam &param) const {
auto *out_ptr = out->mutable_data<float>(); auto *out_ptr = out->mutable_data<float>();
int numel = input_x->numel(); int numel = input_x->numel();
if (numel > 32) { // if (numel > 64) {
asm volatile( // asm volatile(
"pld [%[input_x_ptr], #0] \n\t" // "pld [%[input_x_ptr], #0] \n\t"
"vmov.f32 q8, #0.0 \n\t" // "vmov.f32 q8, #0.0 \n\t"
"subs %[num], %[num], #32 \n\t" // "subs %[num], %[num], #32 \n\t"
"blt end_num_%= \n\t" // "blt end_num_%= \n\t"
"loop_num_%=: \n\t" // "loop_num_%=: \n\t"
"pld [%[input_x_ptr], #1024] \n\t" // "pld [%[input_x_ptr], #1024] \n\t"
//
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
//
"vmax.f32 q0, q0, q8 \n\t" // "vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t" // "vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t" // "vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t" // "vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t" // "vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t" // "vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t" // "vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t" // "vmax.f32 q7, q7, q8 \n\t"
//
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
//
"subs %[num], %[num], #32 \n\t" // "subs %[num], %[num], #32 \n\t"
"bge loop_num_%= \n\t" // "bge loop_num_%= \n\t"
"end_num_%=: \n\t" // "end_num_%=: \n\t"
"cmp %[num], #0 \n\t" // "cmp %[num], #0 \n\t"
"bge end_%= \n\t" // "bge end_%= \n\t"
"mov r6, #4 \n\t" // "mov r6, #4 \n\t"
"mul r5, %[num], r6 \n\t" // "mul r5, %[num], r6 \n\t"
"add %[input_x_ptr], %[input_x_ptr], r5 \n\t" // "add %[input_x_ptr], %[input_x_ptr], r5 \n\t"
"vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q0, q1}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q2, q3}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q4, q5}, [%[input_x_ptr]]! \n\t"
"vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t" // "vld1.32 {q6, q7}, [%[input_x_ptr]]! \n\t"
"vmax.f32 q0, q0, q8 \n\t" // "vmax.f32 q0, q0, q8 \n\t"
"vmax.f32 q1, q1, q8 \n\t" // "vmax.f32 q1, q1, q8 \n\t"
"vmax.f32 q2, q2, q8 \n\t" // "vmax.f32 q2, q2, q8 \n\t"
"vmax.f32 q3, q3, q8 \n\t" // "vmax.f32 q3, q3, q8 \n\t"
"vmax.f32 q4, q4, q8 \n\t" // "vmax.f32 q4, q4, q8 \n\t"
"vmax.f32 q5, q5, q8 \n\t" // "vmax.f32 q5, q5, q8 \n\t"
"vmax.f32 q6, q6, q8 \n\t" // "vmax.f32 q6, q6, q8 \n\t"
"vmax.f32 q7, q7, q8 \n\t" // "vmax.f32 q7, q7, q8 \n\t"
"add %[out_ptr], %[out_ptr], r5 \n\t" // "add %[out_ptr], %[out_ptr], r5 \n\t"
"vst1.32 {q0, q1}, [%[out_ptr]]! \n\t" // "vst1.32 {q0, q1}, [%[out_ptr]]! \n\t"
"vst1.32 {q2, q3}, [%[out_ptr]]! \n\t" // "vst1.32 {q2, q3}, [%[out_ptr]]! \n\t"
"vst1.32 {q4, q5}, [%[out_ptr]]! \n\t" // "vst1.32 {q4, q5}, [%[out_ptr]]! \n\t"
"vst1.32 {q6, q7}, [%[out_ptr]]! \n\t" // "vst1.32 {q6, q7}, [%[out_ptr]]! \n\t"
"end_%=: \n\t" // "end_%=: \n\t"
: // :
: // :
[out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num] "r"(numel) // [out_ptr] "r"(out_ptr), [input_x_ptr] "r"(input_x_ptr), [num]
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "r5", // "r"(numel) : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"r6"); // "q7", "q8", "r5",
} else { // "r6");
ReluFunctor<float> func_; // } else {
math::Transform trans; ReluFunctor<float> func_;
trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_); math::Transform trans;
} trans(input_x_ptr, input_x_ptr + numel, out_ptr, func_);
// }
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -137,4 +137,6 @@ else () ...@@ -137,4 +137,6 @@ else ()
ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-depthwise-conv-op paddle-mobile) target_link_libraries(test-depthwise-conv-op paddle-mobile)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif() endif()
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H #ifndef PADDLE_MOBILE_TEST_LIB_SIZE_H
#define PADDLE_MOBILE_TEST_LIB_SIZE_H #define PADDLE_MOBILE_TEST_LIB_SIZE_H
#include <pthread.h>
#include <thread>
#include <vector> #include <vector>
//#include <list> //#include <list>
//#include <tuple> //#include <tuple>
...@@ -33,7 +35,7 @@ limitations under the License. */ ...@@ -33,7 +35,7 @@ limitations under the License. */
//#include <iostream> //#include <iostream>
//#include <sstream> //#include <sstream>
#include <memory> //#include <memory>
//#include <stdio.h> //#include <stdio.h>
//#include <cstring> //#include <cstring>
...@@ -44,8 +46,10 @@ void foo() { ...@@ -44,8 +46,10 @@ void foo() {
// std::cout << "12345" << std::endl; // std::cout << "12345" << std::endl;
std::vector<int> vec = {1, 2, 3, 4, 5}; std::vector<int> vec = {1, 2, 3, 4, 5};
vec.push_back(2);
// std::find(vec.begin(), vec.end(), 1); pthread_mutex_init(NULL, NULL);
pthread_attr_destroy(NULL);
// std::find(vec.begin(), vec.end(), 1); // std::find(vec.begin(), vec.end(), 1);
// std::list<int> l; // std::list<int> l;
...@@ -70,7 +74,7 @@ void foo() { ...@@ -70,7 +74,7 @@ void foo() {
// int z = 10; // int z = 10;
// } // }
std::shared_ptr<int> s1 = std::make_shared<int>(); // std::shared_ptr<int> s1 = std::make_shared<int>();
// std::stringstream ss; // std::stringstream ss;
// ss << "12345"; // ss << "12345";
......
...@@ -19,11 +19,9 @@ int main() { ...@@ -19,11 +19,9 @@ int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
// ../../../test/models/googlenet // ../../../test/models/googlenet
// ../../../test/models/mobilenet // ../../../test/models/mobilenet
auto program = loader.Load(g_googlenet, true, true); auto program = loader.Load(g_mobilenet_ssd, false, false);
// loader.Load(g_googlenet_combine + "/model", g_googlenet_combine + // loader.Load(g_googlenet_combine + "/model", g_googlenet_combine +
// "/params", // "/params", true);
// true);
program.originProgram->Description("program desc: "); program.originProgram->Description("program desc: ");
return 0; return 0;
} }
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
bool optimize = false; bool optimize = true;
auto time1 = time(); auto time1 = time();
auto program = loader.Load(g_googlenet, optimize); auto program = loader.Load(g_googlenet, optimize);
// auto program = loader.Load(g_googlenet_combine + "/model", // auto program = loader.Load(g_googlenet_combine + "/model",
......
...@@ -41,7 +41,7 @@ class TestBatchNormOp { ...@@ -41,7 +41,7 @@ class TestBatchNormOp {
for (int j = 0; j < ops.size(); ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j]; std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "batch_norm" && if (op->Type() == "batch_norm" &&
op->Input("X")[0] == "conv2d_0.tmp_0") { op->Input("X")[0] == "conv2d_5.tmp_0") {
DLOG << " mul attr size: " << op->GetAttrMap().size(); DLOG << " mul attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size(); DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size(); DLOG << " outputs size: " << op->GetOutputs().size();
...@@ -67,29 +67,29 @@ class TestBatchNormOp { ...@@ -67,29 +67,29 @@ class TestBatchNormOp {
const Tensor &t5) { const Tensor &t5) {
// feed // feed
auto scope = program_.scope; auto scope = program_.scope;
Variable *x1_feed_value = scope->Var("conv2d_0.tmp_0"); Variable *x1_feed_value = scope->Var("conv2d_5.tmp_0");
auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>(); auto tensor_x1 = x1_feed_value->GetMutable<LoDTensor>();
tensor_x1->ShareDataWith(t1); tensor_x1->ShareDataWith(t1);
Variable *mean_feed_value = scope->Var("batch_norm_0.w_1"); Variable *mean_feed_value = scope->Var("batch_norm_10.w_1");
auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>(); auto tensor_mean = mean_feed_value->GetMutable<LoDTensor>();
tensor_mean->ShareDataWith(t2); tensor_mean->ShareDataWith(t2);
Variable *scale_feed_value = scope->Var("batch_norm_0.w_0"); Variable *scale_feed_value = scope->Var("batch_norm_10.w_0");
auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>(); auto tensor_scale = scale_feed_value->GetMutable<LoDTensor>();
tensor_scale->ShareDataWith(t3); tensor_scale->ShareDataWith(t3);
Variable *variance_feed_value = scope->Var("batch_norm_0.w_2"); Variable *variance_feed_value = scope->Var("batch_norm_10.w_2");
auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>(); auto tensor_variance = variance_feed_value->GetMutable<LoDTensor>();
tensor_variance->ShareDataWith(t4); tensor_variance->ShareDataWith(t4);
Variable *bias_feed_value = scope->Var("batch_norm_0.b_0"); Variable *bias_feed_value = scope->Var("batch_norm_10.b_0");
auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>(); auto tensor_bias = bias_feed_value->GetMutable<LoDTensor>();
tensor_bias->ShareDataWith(t5); tensor_bias->ShareDataWith(t5);
Variable *output = scope->Var("batch_norm_0.tmp_2"); Variable *output = scope->Var("batch_norm_10.tmp_2");
auto *output_tensor = output->GetMutable<LoDTensor>(); auto *output_tensor = output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({4, 10, 2, 2}); output_tensor->mutable_data<float>({1, 256, 38, 38});
// DLOG << typeid(output_tensor).name(); // DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims(); // DLOG << "output_tensor dims: " << output_tensor->dims();
...@@ -128,30 +128,32 @@ int main() { ...@@ -128,30 +128,32 @@ int main() {
DLOG << "----------**********----------"; DLOG << "----------**********----------";
DLOG << "begin to run BatchNormOp Test"; DLOG << "begin to run BatchNormOp Test";
paddle_mobile::Loader<paddle_mobile::CPU> loader; paddle_mobile::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_resnet)); auto program = loader.Load(std::string(g_mobilenet_ssd));
/// input x (4,10,2,2) /// input x (4,10,2,2)
paddle_mobile::framework::Tensor inputx1; paddle_mobile::framework::Tensor inputx1;
SetupTensor<float>(&inputx1, {4, 10, 2, 2}, static_cast<float>(0), SetupTensor<float>(&inputx1, {1, 256, 38, 38}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *inputx1_ptr = inputx1.data<float>(); auto *inputx1_ptr = inputx1.data<float>();
paddle_mobile::framework::Tensor mean; paddle_mobile::framework::Tensor mean;
SetupTensor<float>(&mean, {10}, static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&mean, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *mean_ptr = mean.data<float>(); auto *mean_ptr = mean.data<float>();
paddle_mobile::framework::Tensor scale; paddle_mobile::framework::Tensor scale;
SetupTensor<float>(&scale, {10}, static_cast<float>(0), SetupTensor<float>(&scale, {256}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *scale_ptr = scale.data<float>(); auto *scale_ptr = scale.data<float>();
paddle_mobile::framework::Tensor variance; paddle_mobile::framework::Tensor variance;
SetupTensor<float>(&variance, {10}, static_cast<float>(0), SetupTensor<float>(&variance, {256}, static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
auto *variance_ptr = variance.data<float>(); auto *variance_ptr = variance.data<float>();
paddle_mobile::framework::Tensor bias; paddle_mobile::framework::Tensor bias;
SetupTensor<float>(&bias, {10}, static_cast<float>(0), static_cast<float>(1)); SetupTensor<float>(&bias, {256}, static_cast<float>(0),
static_cast<float>(1));
auto *bias_ptr = bias.data<float>(); auto *bias_ptr = bias.data<float>();
paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp( paddle_mobile::framework::TestBatchNormOp<paddle_mobile::CPU> testBatchNormOp(
...@@ -161,11 +163,13 @@ int main() { ...@@ -161,11 +163,13 @@ int main() {
testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias); testBatchNormOp.predict_bn(inputx1, mean, scale, variance, bias);
auto *output_bn_ptr = output_bn->data<float>(); auto *output_bn_ptr = output_bn->data<float>();
/// [2, 5, 1, 0] DLOG << " (" << inputx1_ptr[0] << " - " << mean_ptr[0] << ")/(("
DLOG << " (" << inputx1_ptr[102] << " - " << mean_ptr[5] << ")/((" << variance_ptr[0] << " + 0.00001"
<< variance_ptr[5] << " + 0.00001" << ")^0.5)* " << scale_ptr[0] << " + " << bias_ptr[0] << " = ";
<< ")^0.5)* " << scale_ptr[5] << " + " << bias_ptr[5] << " = "; DLOG << output_bn_ptr[0];
DLOG << output_bn_ptr[102];
DLOG << "input_ptr 0 : " << inputx1_ptr[0];
DLOG << "output_ptr 0 : " << output_bn_ptr[0];
return 0; return 0;
} }
#!/usr/bin/env sh
# push_fn [mode]
#
# Creates the bin/models/images directories on the attached Android device
# (via adb) and pushes the test executables and the library build output
# into the bin directory.
#   $1 - optional mode. When it is exactly "npm", the image and model
#        uploads are skipped; any other value (or no argument) pushes them.
#
# Written for POSIX sh because the script's shebang is `sh`: the bash-only
# `[[ ... ]]` test is not available in shells such as dash, where it fails
# with "not found" and silently skips the image/model upload.
push_fn () {
    # Host-side sources. The variables ending in "/*" are glob patterns and
    # are deliberately expanded UNQUOTED below so the shell expands them.
    MODELS_PATH="../test/models/*"
    MODELS_SRC="../test/models"
    IMAGE_PATH="../test/images/*"
    EXE_FILE="../test/build/*"

    EXE_DIR="data/local/tmp/bin"
    adb shell mkdir "${EXE_DIR}"

    MODELS_DIR="data/local/tmp/models"
    adb shell mkdir "${MODELS_DIR}"
    # Create one device directory per entry found in the host model folder.
    # A glob is used instead of parsing `ls` output so names are not
    # word-split; the -e guard covers the "no match" case where the
    # pattern is left unexpanded.
    for model_path in "${MODELS_SRC}"/*
    do
        [ -e "${model_path}" ] || continue
        adb shell mkdir "${MODELS_DIR}/$(basename "${model_path}")"
    done

    IMAGES_DIR="data/local/tmp/images"
    adb shell mkdir "${IMAGES_DIR}"

    LIB_PATH="../build/release/arm-v7a/build/*"
    adb push ${EXE_FILE} "${EXE_DIR}"
    adb push ${LIB_PATH} "${EXE_DIR}"

    # "${1:-}" keeps the test well-formed when no argument was passed.
    if [ "${1:-}" != "npm" ]; then
        adb push ${IMAGE_PATH} "${IMAGES_DIR}"
        adb push ${MODELS_PATH} "${MODELS_DIR}"
    fi
}
# Entry point: forward the "npm" mode flag to push_fn when it was given as
# the first script argument, otherwise call push_fn with no arguments.
# Uses the POSIX `[ ... = ... ]` test (the shebang is `sh`, where the
# bash-only `[[ ... == ... ]]` is unavailable) and quotes "$1" so an
# absent argument does not produce a malformed test expression.
if [ "${1:-}" = "npm" ]; then
    push_fn "$1"
else
    push_fn
fi
...@@ -24,8 +24,15 @@ adb shell mkdir ${IMAGES_DIR} ...@@ -24,8 +24,15 @@ adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../build/release/arm-v7a/build/*" LIB_PATH="../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR} adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR} adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then
adb push ${IMAGE_PATH} ${IMAGES_DIR} adb push ${IMAGE_PATH} ${IMAGES_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR} adb push ${MODELS_PATH} ${MODELS_DIR}
fi
adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${TESTUNIT}" adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${TESTUNIT}"
} }
if [[ $1 == "npm" ]]; then
push_fn $1
else
push_fn push_fn
fi
\ No newline at end of file
...@@ -19,12 +19,19 @@ adb shell mkdir ${IMAGES_DIR} ...@@ -19,12 +19,19 @@ adb shell mkdir ${IMAGES_DIR}
LIB_PATH="../../build/release/arm-v7a/build/*" LIB_PATH="../../build/release/arm-v7a/build/*"
adb push ${EXE_FILE} ${EXE_DIR} adb push ${EXE_FILE} ${EXE_DIR}
adb push ${LIB_PATH} ${EXE_DIR} adb push ${LIB_PATH} ${EXE_DIR}
if [[ $1 != "npm" ]]; then
adb push ${IMAGE_PATH} ${IMAGES_DIR} adb push ${IMAGE_PATH} ${IMAGES_DIR}
adb push ${MODELS_PATH} ${MODELS_DIR} adb push ${MODELS_PATH} ${MODELS_DIR}
fi
echo "test-op or test-net below : " echo "test-op or test-net below : "
adb shell ls /data/local/tmp/bin adb shell ls /data/local/tmp/bin
echo "**** choose OP or NET to test ****" echo "**** choose OP or NET to test ****"
read -p "which to test : " test_name read -p "which to test : " test_name
adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${test_name}" adb shell "cd /data/local/tmp/bin; LD_LIBRARY_PATH=. ./${test_name}"
} }
if [[ $1 == "npm" ]]; then
push_fn $1
else
push_fn push_fn
fi
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册