Commit 4769f4c9 authored by xiebaiyuan

run android gpu for super

Parent a29abccb
@@ -38,18 +38,15 @@ namespace framework {
 #pragma mark - executor
 template <typename Device, typename T>
-Executor<Device, T>::Executor(const Program<Device> &program, paddle_mobile::PaddleMobileConfigInternal config, int batch_size,
-                              const bool use_optimize, const bool lod_mode): Executor(program, batch_size, use_optimize, lod_mode) {
-  config_ = config;
-};
-template <typename Device, typename T>
-Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
-                              const bool use_optimize, const bool lod_mode)
+Executor<Device, T>::Executor(const Program<Device> &program,
+                              paddle_mobile::PaddleMobileConfigInternal config,
+                              int batch_size, const bool use_optimize,
+                              const bool lod_mode)
     : program_(program),
       batch_size_(batch_size),
       use_optimize_(use_optimize),
-      lod_mode_(lod_mode) {
+      lod_mode_(lod_mode),
+      config_(config) {
   DLOG << "executor in lod mode: " << lod_mode_;
   Variable *variable_ptr = program_.scope->Var("batch_size");
@@ -224,7 +221,8 @@ void Executor<Device, T>::InitCombineMemory() {
         LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
       } else {
         if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
-          DLOG << " init combine memory no persistable in lod: " << var_desc->Name();
+          DLOG << " init combine memory no persistable in lod: "
+               << var_desc->Name();
           varInputMemory(var_desc, var, tensor);
         } else {
           DLOG << " init combine memory no persistable: " << var_desc->Name();
@@ -239,7 +237,8 @@ void Executor<Device, T>::InitCombineMemory() {
 }
 template <typename Device, typename T>
-void Executor<Device, T>::InitNoPersistableMemory(const LoDTensor &input_tensor) {
+void Executor<Device, T>::InitNoPersistableMemory(
+    const LoDTensor &input_tensor) {
   for (const auto &block : program_desc_->Blocks()) {
     for (const auto &var_desc : block->Vars()) {
       auto var = program_.scope->Var(var_desc->Name());
@@ -251,8 +250,10 @@ void Executor<Device, T>::InitNoPersistableMemory(const LoDTensor &input_tensor)
       } else {
         if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
           DDim tensor_dim = tensor->dims();
-          DDim new_dim = make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2], input_tensor.dims()[3]});
-          tensor->template Resize(new_dim);
+          DDim new_dim =
+              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
+                         input_tensor.dims()[3]});
+          tensor->Resize(new_dim);
           tensor->template mutable_data<T>();
         }
       }
@@ -313,7 +314,6 @@ PMStatus Executor<Device, T>::Predict(
 template <typename Device, typename T>
 std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                             const std::vector<int64_t> &dims) {
   Tensor feed_tensor(input, make_ddim(dims));
   SetInput(feed_tensor, "feed");
   std::vector<T> output;
@@ -336,7 +336,8 @@ void Executor<Device, T>::SetInput(const Tensor &input,
   auto *target_tensor = target_var->template GetMutable<LoDTensor>();
   if (config_.load_when_predict) {
-    if (target_tensor->IsInitialized() && target_tensor->dims() != input.dims()) {
+    if (target_tensor->IsInitialized() &&
+        target_tensor->dims() != input.dims()) {
       InitNoPersistableMemory(*target_tensor);
     }
   }
@@ -348,14 +349,14 @@ void Executor<Device, T>::SetInput(const Tensor &input,
 template <typename Device, typename T>
 void Executor<Device, T>::SetInput(const LoDTensor &input,
                                    const std::string &var_name) {
   auto *target_var = program_.scope->FindVar(var_name);
   PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                         var_name.c_str());
   auto *target_tensor = target_var->template GetMutable<LoDTensor>();
   if (config_.load_when_predict) {
-    if (target_tensor->IsInitialized() && target_tensor->dims() != input.dims()) {
+    if (target_tensor->IsInitialized() &&
+        target_tensor->dims() != input.dims()) {
       InitNoPersistableMemory(*target_tensor);
     }
   }
@@ -504,6 +505,70 @@ void Executor<Device, T>::Predict_To(int end) {
 #endif
 #ifdef PADDLE_MOBILE_CL
+template <>
+void Executor<GPU_CL, float>::InitNoPersistableMemory(
+    const LoDTensor &input_tensor) {
+  DLOG << "CL InitNoPersistableMemory ";
+  for (const auto &block : program_desc_->Blocks()) {
+    for (const auto &var_desc : block->Vars()) {
+      auto var = program_.scope->Var(var_desc->Name());
+      auto cl_image = var->template GetMutable<CLImage>();
+      if (var_desc->Persistable()) {
+        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
+          continue;
+        }
+      } else {
+        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
+          cl_context context = program_.scope->GetCLScpoe()->Context();
+          cl_command_queue command_queue =
+              program_.scope->GetCLScpoe()->CommandQueue();
+          DDim tensor_dim = cl_image->dims();
+          DDim new_dim =
+              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
+                         input_tensor.dims()[3]});
+          cl_image->Resize(new_dim);
+          cl_image->InitEmptyImage(context, command_queue, new_dim);
+        }
+      }
+    }
+  }
+  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
+  output->Resize(input_tensor.dims());
+  output->mutable_data<float>();
+}
+
+template <>
+void Executor<GPU_CL, float>::SetInput(const Tensor &input,
+                                       const std::string &var_name) {
+  auto *target_var = program_.scope->FindVar(var_name);
+  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
+                        var_name.c_str());
+  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
+  DLOG << "config_.load_when_predict " << config_.load_when_predict;
+  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
+  DLOG << "target_tensor->dims() " << target_tensor->dims();
+  DLOG << "input.dims() " << input.dims();
+  if (config_.load_when_predict) {
+    if (target_tensor->dims() != input.dims()) {
+      if (!target_tensor->IsInitialized()) {
+        DLOG << "SetInput ---- > resize1";
+        std::cout << "SetInput ---- > resize1" << std::endl;
+        target_tensor->Resize(input.dims());
+        target_tensor->mutable_data<float>();
+      }
+      InitNoPersistableMemory(*target_tensor);
+    }
+  } else {
+    DLOG << "SetInput ---- > resize2";
+    target_tensor->Resize(input.dims());
+    DLOG << "SetInput ---- > ShareDataWith";
+  }
+  target_tensor->ShareDataWith(input);
+}
+
 template <typename Device, typename T>
 void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                      char **data) {}
@@ -639,6 +704,8 @@ void Executor<GPU_CL, float>::InitMemory() {
 template <>
 void Executor<GPU_CL, float>::InitCombineMemory() {
+  DLOG << "CL InitCombineMemory---- "
+       << "config_.load_when_predict: " << config_.load_when_predict;
   char *origin_data = nullptr;
   bool self_alloc = false;
   if (program_.combined_params_buf && program_.combined_params_len) {
......
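Note: the GPU_CL `SetInput` specialization added above follows a lazy "re-init on shape change" pattern: when `load_when_predict` is set and the incoming dims differ from what the network was last prepared for, the feed tensor is (re)allocated and all non-persistable buffers plus the fetch output are rebuilt for the new height/width before the input data is shared in. The snippet below is a minimal, self-contained sketch of that control flow only; `FakeTensor`, `Runner`, and `ReinitActivations` are illustrative stand-ins, not paddle-mobile APIs.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct FakeTensor {
  std::vector<int64_t> dims;
  bool initialized = false;
  void Resize(const std::vector<int64_t> &d) { dims = d; }
  void Allocate() { initialized = true; }
};

struct Runner {
  bool load_when_predict = true;  // mirrors config_.load_when_predict
  FakeTensor feed;                // the "feed" tensor cached between predicts

  void ReinitActivations(const std::vector<int64_t> &d) {
    // In the real code this walks every non-persistable var and re-creates its
    // CLImage with the new H/W; here we only log the decision.
    std::cout << "re-init activations for " << d[2] << "x" << d[3] << "\n";
  }

  void SetInput(const std::vector<int64_t> &input_dims) {
    if (load_when_predict) {
      if (feed.dims != input_dims) {  // shape changed since the last predict
        if (!feed.initialized) {      // first predict: allocate the feed tensor
          feed.Resize(input_dims);
          feed.Allocate();
        }
        ReinitActivations(input_dims);  // rebuild intermediate buffers
      }
    } else {
      feed.Resize(input_dims);  // fixed-shape path: just resize once
    }
    feed.dims = input_dims;  // stands in for target_tensor->ShareDataWith(input)
  }
};

int main() {
  Runner r;
  r.SetInput({1, 1, 300, 300});  // first call: allocate + re-init
  r.SetInput({1, 1, 300, 300});  // same shape: nothing to rebuild
  r.SetInput({1, 1, 600, 600});  // new shape: activations re-initialized
  return 0;
}
```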
@@ -32,9 +32,8 @@ namespace framework {
 template <typename Device, typename T = float>
 class Executor {
  public:
-  Executor(const Program<Device> &program, paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
-           const bool use_optimize = true, const bool lod_mode = false);
-  Executor(const Program<Device> &program, int batch_size = 1,
+  Executor(const Program<Device> &program,
+           paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
            const bool use_optimize = true, const bool lod_mode = false);
   PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
@@ -62,8 +61,6 @@ class Executor {
  protected:
   Executor() = default;
   bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
                       LoDTensor *tensor) const;
   void InitMemory();
@@ -89,7 +86,6 @@ class Executor {
   // for super resoltion
   DDim input_dim_;
 #ifdef PADDLE_MOBILE_PROFILE
   struct ProfInfo {
     int tid = 0;
......
@@ -25,7 +25,6 @@ namespace framework {
 template <typename Device = CPU, typename T = float>
 class Loader {
  public:
   /*
    * @b load separate format fluid model
    * @b 加载分开存储的fluid模型
@@ -60,7 +59,6 @@ class Loader {
   void InitMemoryFromProgram(
       const std::shared_ptr<ProgramDesc> &originProgramDesc,
       const std::shared_ptr<Scope> &scope);
 };
 }  // namespace framework
......
@@ -42,8 +42,8 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<framework::Executor<Device, T>>(
-        loader_->Load(dirname, optimize, quantification), config_, batch_size, optimize,
-        loddable);
+        loader_->Load(dirname, optimize, quantification), config_, batch_size,
+        optimize, loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
@@ -64,7 +64,8 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
   if (executor_.get() == nullptr) {
     executor_ = std::make_shared<framework::Executor<Device, T>>(
-        loader_->Load(model_path, para_path, optimize, quantification), config_, batch_size, optimize, loddable);
+        loader_->Load(model_path, para_path, optimize, quantification), config_,
+        batch_size, optimize, loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
@@ -86,8 +87,8 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(
     executor_ = std::make_shared<framework::Executor<Device, T>>(
         loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
                                     combined_params_buf, optimize,
-                                    quantification), config_,
-        batch_size, optimize, loddable);
+                                    quantification),
+        config_, batch_size, optimize, loddable);
   } else {
     LOG(kLOG_INFO) << "executor inited";
   }
......
@@ -33,12 +33,10 @@ limitations under the License. */
 namespace paddle_mobile {
 template <typename Device, typename T = float>
 class PaddleMobile {
  public:
-  PaddleMobile(PaddleMobileConfigInternal config): config_(config){
+  PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
 #ifndef PADDLE_MOBILE_CL
     bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
     PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
......
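For context, the `PADDLE_MOBILE_ENFORCE(!is_gpu, ...)` guard in the constructor above rejects a GPU_CL instantiation when the library was built without `PADDLE_MOBILE_CL`. Below is a small standalone illustration of that device check; `CPU`, `GPU_CL`, and `CheckDeviceSupported` are simplified stand-ins for paddle-mobile's `DeviceType` machinery, not its actual API.

```cpp
#include <cstdio>
#include <cstdlib>
#include <type_traits>

struct CPU {};
struct GPU_CL {};

template <typename Device>
void CheckDeviceSupported(bool built_with_cl) {
  // Mirrors: bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
  const bool is_gpu = std::is_same<GPU_CL, Device>::value;
  if (is_gpu && !built_with_cl) {
    std::fprintf(stderr, "Please recompile with GPU_CL is on\n");
    std::abort();
  }
}

int main() {
  CheckDeviceSupported<CPU>(false);    // fine: CPU never needs OpenCL
  CheckDeviceSupported<GPU_CL>(true);  // fine: OpenCL support was compiled in
  // CheckDeviceSupported<GPU_CL>(false);  // would abort with the message above
  return 0;
}
```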
@@ -1190,8 +1190,7 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
 void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
                         int lda, const float *B, int ldb, float beta, float *C,
-                        int ldc, bool relu) {
-}
+                        int ldc, bool relu) {}
 #else
......
@@ -27,7 +27,7 @@ int main() {
   // auto program = loader.Load(g_super, true);
   auto program = loader.Load(std::string(g_super) + "/model",
                              std::string(g_super) + "/params", false);
   // program.originProgram->Description("program desc: ");
   return 0;
......
@@ -21,7 +21,7 @@ int main() {
   paddle_mobile::PaddleMobileConfigInternal config;
   config.load_when_predict = true;
-  paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
+  paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
   // paddle_mobile.SetThreadNum(4);
   auto time1 = paddle_mobile::time();
 #ifdef PADDLE_MOBILE_CL
@@ -40,21 +40,26 @@ int main() {
   std::vector<float> input;
   std::vector<int64_t> dims{1, 1, 300, 300};
-  GetInput<float>(g_yolo_img, &input, dims);
+  GetInput<float>(g_test_image_1x3x224x224, &input, dims);
   std::vector<float> vec_result;
   auto time3 = paddle_mobile::time();
-  int max = 1;
+  int max = 10;
   for (int i = 0; i < max; ++i) {
+    auto time5 = paddle_mobile::time();
     vec_result = paddle_mobile.Predict(input, dims);
+    auto time6 = paddle_mobile::time();
+    std::cout << "predict cost :第" << i << ": "
+              << paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
   }
   auto time4 = paddle_mobile::time();
   std::cout << "predict cost :"
             << paddle_mobile::time_diff(time3, time4) / max << "ms"
             << std::endl;
-  std::vector<float>::iterator biggest =
+  auto biggest =
       std::max_element(std::begin(vec_result), std::end(vec_result));
   std::cout << " Max element is " << *biggest << " at position "
             << std::distance(std::begin(vec_result), biggest) << std::endl;
......
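The updated test above times each `Predict` call individually with the project's `paddle_mobile::time()` / `time_diff()` helpers, which are not shown in this diff. A minimal sketch of the equivalent per-iteration timing with standard `std::chrono` follows; the `Predict` call is faked with `sleep_for`, and the helper names (`Clock`, `ms_between`) are illustrative, not paddle-mobile's.

```cpp
#include <chrono>
#include <iostream>
#include <thread>

using Clock = std::chrono::steady_clock;

// Milliseconds elapsed between two time points, as a double.
static double ms_between(Clock::time_point a, Clock::time_point b) {
  return std::chrono::duration<double, std::milli>(b - a).count();
}

int main() {
  const int max = 10;
  auto t_all0 = Clock::now();
  for (int i = 0; i < max; ++i) {
    auto t0 = Clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(5));  // stand-in for Predict()
    auto t1 = Clock::now();
    std::cout << "predict cost, iteration " << i << ": " << ms_between(t0, t1)
              << "ms" << std::endl;
  }
  auto t_all1 = Clock::now();
  std::cout << "average predict cost: " << ms_between(t_all0, t_all1) / max
            << "ms" << std::endl;
  return 0;
}
```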