提交 4db33cfd · 作者: hjchen2

Merge branch 'dev-latest' of https://github.com/hjchen2/paddle-mobile into dev-latest

cmake_minimum_required(VERSION 3.0.0) cmake_minimum_required(VERSION 3.0.0)
option(USE_OPENMP "openmp support" ON) option(USE_OPENMP "openmp support" ON)
option(DEBUGING "enable debug mode" OFF) option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF) option(USE_EXCEPTION "use std exception" ON)
option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io
option(LOG_PROFILE "log profile" OFF) option(LOG_PROFILE "log profile" OFF)
# select the platform to build # select the platform to build
option(CPU "armv7 with neon" ON) option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF) option(GPU_MALI "mali gpu" OFF)
...@@ -15,7 +16,6 @@ if(FPGA) ...@@ -15,7 +16,6 @@ if(FPGA)
option(FPGAV2 "fpga v2" OFF) option(FPGAV2 "fpga v2" OFF)
endif() endif()
project(paddle-mobile) project(paddle-mobile)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
...@@ -247,5 +247,3 @@ elseif(FPGA) ...@@ -247,5 +247,3 @@ elseif(FPGA)
add_subdirectory(test) add_subdirectory(test)
endif() endif()
add_subdirectory(test)
...@@ -95,12 +95,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -95,12 +95,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
} }
template <typename Dtype> template <typename Dtype>
void LoadMemInternal(void **data, framework::LoDTensor *tensor) { static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
bool quant_uint8 = false) {
char **data_buf = reinterpret_cast<char **>(data); char **data_buf = reinterpret_cast<char **>(data);
int64_t size = tensor->numel(); int64_t size = tensor->numel();
Dtype *tensor_data = tensor->mutable_data<Dtype>(); Dtype *tensor_data = tensor->mutable_data<Dtype>();
if (0) { if (quant_uint8) {
// TODO(hjchen2) should be moved into operator init function // should be moved into operator init function
float min_value; float min_value;
float max_value; float max_value;
memory::Copy(&min_value, data_buf, sizeof(float)); memory::Copy(&min_value, data_buf, sizeof(float));
...@@ -156,7 +157,8 @@ void Executor<Dtype, P>::LoadMemory( ...@@ -156,7 +157,8 @@ void Executor<Dtype, P>::LoadMemory(
// parse tensor from stream // parse tensor from stream
switch (tensor_desc.DataType()) { switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32: case framework::VARTYPE_TYPE_FP32:
LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor); LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
program_.quantification);
break; break;
case framework::VARTYPE_TYPE_INT8: case framework::VARTYPE_TYPE_INT8:
LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor); LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
...@@ -263,7 +265,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -263,7 +265,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
framework::Variable *g_feed_value = program_.scope->Var("feed"); framework::Variable *g_feed_value = program_.scope->Var("feed");
framework::Tensor *feed_tensor = framework::Tensor *feed_tensor =
g_feed_value->GetMutable<framework::LoDTensor>(); g_feed_value->GetMutable<framework::LoDTensor>();
DLOG << "feed_tensor dim: " << feed_tensor->dims();
feed_tensor->Resize(t.dims()); feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t); feed_tensor->ShareDataWith(t);
std::shared_ptr<framework::BlockDesc> to_predict_block = std::shared_ptr<framework::BlockDesc> to_predict_block =
...@@ -298,16 +299,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -298,16 +299,8 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
for (int i = 0; i < profile.size(); i++) { for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i]; const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
"Filter", inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
_tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
} else {
_tp[ops[i]->Type()] += timeCost; _tp[ops[i]->Type()] += timeCost;
} }
}
printf("====================[ profile ]======================\n"); printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>; using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end()); std::vector<prof_t> _tv(_tp.begin(), _tp.end());
...@@ -376,14 +369,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -376,14 +369,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
for (int i = 0; i < profile.size(); i++) { for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i]; const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
if (ops[i]->Type() == "conv2d") {
auto inputs = ops[i]->Inputs();
auto input_keys = ops[i]->GetInputKeys();
auto *filter = framework::GetVarValue<framework::LoDTensor>(
input_keys[1], inputs, *(program_.scope));
int kernel_size = filter->dims()[2];
printf("kernel size: %d\n", kernel_size);
}
_tp[ops[i]->Type()] += timeCost; _tp[ops[i]->Type()] += timeCost;
} }
printf("====================[ profile ]======================\n"); printf("====================[ profile ]======================\n");
......
...@@ -909,10 +909,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -909,10 +909,9 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"veor q13, q13, q13 \n" "veor q13, q13, q13 \n"
"veor q14, q14, q14 \n" "veor q14, q14, q14 \n"
"veor q15, q15, q15 \n" "veor q15, q15, q15 \n"
// loop 2 channels
"cmp %[inter_channel], #0 \n"
"ble cmp_remain_%= \n"
"b store_res_%= \n"
// loop 2 channels
"loop_2c_%=: \n" "loop_2c_%=: \n"
"vld1.32 {d0-d3}, [%[w_ptr]]! \n" "vld1.32 {d0-d3}, [%[w_ptr]]! \n"
"vld1.32 {d4-d7}, [%[in_ptr]]! \n" "vld1.32 {d4-d7}, [%[in_ptr]]! \n"
...@@ -937,11 +936,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -937,11 +936,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"subs %[inter_channel], #1 \n" "subs %[inter_channel], #1 \n"
"bne loop_2c_%= \n" "bne loop_2c_%= \n"
"mov pc, lr \n"
// cmp remain channel > 0
"cmp_remain_%=: \n"
"cmp %[remain_channel], #0 \n"
"ble store_res_%= \n"
// loop 1 channel // loop 1 channel
"loop_c_%=: \n" "loop_c_%=: \n"
...@@ -959,8 +954,16 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -959,8 +954,16 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
"subs %[remain_channel], #1 \n" "subs %[remain_channel], #1 \n"
"bne loop_c_%= \n" "bne loop_c_%= \n"
"mov pc, lr \n"
"store_res_%=: \n" "store_res_%=: \n"
"cmp %[inter_channel], #0 \n"
"it gt \n"
"blgt loop_2c_%= \n"
"cmp %[remain_channel], #0 \n"
"it gt \n"
"blgt loop_c_%= \n"
"vst1.32 {d16-d19}, [%[uv_ptr]]! \n" "vst1.32 {d16-d19}, [%[uv_ptr]]! \n"
"vst1.32 {d20-d23}, [%[uv_ptr]]! \n" "vst1.32 {d20-d23}, [%[uv_ptr]]! \n"
"vst1.32 {d24-d27}, [%[uv_ptr]]! \n" "vst1.32 {d24-d27}, [%[uv_ptr]]! \n"
...@@ -970,7 +973,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input, ...@@ -970,7 +973,7 @@ void winograd_transform_output<8, 3>(const framework::Tensor &input,
[inter_channel] "+r"(inter_channel) [inter_channel] "+r"(inter_channel)
: :
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "pc", "lr");
} }
} }
} }
......
...@@ -3,4 +3,3 @@ set(ANDROID_PIE TRUE) ...@@ -3,4 +3,3 @@ set(ANDROID_PIE TRUE)
set(ANDROID_STL "c++_static") set(ANDROID_STL "c++_static")
set(ANDROID_PLATFORM "android-22") set(ANDROID_PLATFORM "android-22")
include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake") include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
#include("/Users/chenhoujiang/Project/android-ndk-r16b/build/cmake/android.toolchain.cmake")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册