Commit 4769f4c9 authored by xiebaiyuan

Run the Android GPU (OpenCL) path for the super-resolution model

Parent a29abccb
@@ -38,18 +38,15 @@ namespace framework {
#pragma mark - executor
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program, paddle_mobile::PaddleMobileConfigInternal config, int batch_size,
const bool use_optimize, const bool lod_mode): Executor(program, batch_size, use_optimize, lod_mode) {
config_ = config;
};
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
const bool use_optimize, const bool lod_mode)
Executor<Device, T>::Executor(const Program<Device> &program,
paddle_mobile::PaddleMobileConfigInternal config,
int batch_size, const bool use_optimize,
const bool lod_mode)
: program_(program),
batch_size_(batch_size),
use_optimize_(use_optimize),
lod_mode_(lod_mode) {
lod_mode_(lod_mode),
config_(config) {
DLOG << "executor in lod mode: " << lod_mode_;
Variable *variable_ptr = program_.scope->Var("batch_size");
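For orientation, here is a minimal sketch of how the reworked constructor is invoked once the config parameter is part of the primary signature (the call mirrors the paddle_mobile.cpp hunk further down; the GPU_CL/float instantiation, the argument values, and the surrounding loader call are illustrative assumptions, not part of this patch):

```cpp
// Sketch only: config now flows through a single Executor constructor
// instead of a delegating constructor that copied it afterwards.
paddle_mobile::PaddleMobileConfigInternal config;
config.load_when_predict = true;  // defer buffer sizing until an input arrives

auto executor = std::make_shared<framework::Executor<GPU_CL, float>>(
    loader_->Load(model_path, para_path, /*optimize=*/true,
                  /*quantification=*/false),
    config, /*batch_size=*/1, /*use_optimize=*/true, /*lod_mode=*/false);
```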
@@ -224,7 +221,8 @@ void Executor<Device, T>::InitCombineMemory() {
LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
} else {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
DLOG << " init combine memory no persistable in lod: " << var_desc->Name();
DLOG << " init combine memory no persistable in lod: "
<< var_desc->Name();
varInputMemory(var_desc, var, tensor);
} else {
DLOG << " init combine memory no persistable: " << var_desc->Name();
@@ -239,7 +237,8 @@ void Executor<Device, T>::InitCombineMemory() {
}
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const LoDTensor &input_tensor) {
void Executor<Device, T>::InitNoPersistableMemory(
const LoDTensor &input_tensor) {
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
@@ -251,8 +250,10 @@ void Executor<Device, T>::InitNoPersistableMemory(const LoDTensor &input_tensor)
} else {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
DDim tensor_dim = tensor->dims();
DDim new_dim = make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2], input_tensor.dims()[3]});
tensor->template Resize(new_dim);
DDim new_dim =
make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
input_tensor.dims()[3]});
tensor->Resize(new_dim);
tensor->template mutable_data<T>();
}
}
@@ -313,7 +314,6 @@ PMStatus Executor<Device, T>::Predict(
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
const std::vector<int64_t> &dims) {
Tensor feed_tensor(input, make_ddim(dims));
SetInput(feed_tensor, "feed");
std::vector<T> output;
@@ -336,7 +336,8 @@ void Executor<Device, T>::SetInput(const Tensor &input,
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
if (config_.load_when_predict) {
if (target_tensor->IsInitialized() && target_tensor->dims() != input.dims()) {
if (target_tensor->IsInitialized() &&
target_tensor->dims() != input.dims()) {
InitNoPersistableMemory(*target_tensor);
}
}
@@ -348,14 +349,14 @@ void Executor<Device, T>::SetInput(const Tensor &input,
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
if (config_.load_when_predict) {
if (target_tensor->IsInitialized() && target_tensor->dims() != input.dims()) {
if (target_tensor->IsInitialized() &&
target_tensor->dims() != input.dims()) {
InitNoPersistableMemory(*target_tensor);
}
}
@@ -504,6 +505,70 @@ void Executor<Device, T>::Predict_To(int end) {
#endif
#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
const LoDTensor &input_tensor) {
DLOG << "CL InitNoPersistableMemory ";
for (const auto &block : program_desc_->Blocks()) {
for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name());
auto cl_image = var->template GetMutable<CLImage>();
if (var_desc->Persistable()) {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue;
}
} else {
if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
cl_context context = program_.scope->GetCLScpoe()->Context();
cl_command_queue command_queue =
program_.scope->GetCLScpoe()->CommandQueue();
DDim tensor_dim = cl_image->dims();
DDim new_dim =
make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
input_tensor.dims()[3]});
cl_image->Resize(new_dim);
cl_image->InitEmptyImage(context, command_queue, new_dim);
}
}
}
}
std::shared_ptr<LoDTensor> output = GetOutput("fetch");
output->Resize(input_tensor.dims());
output->mutable_data<float>();
}
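A brief, purely illustrative reading of the reshape rule used above: batch and channel are kept from the existing CL image, while height and width are taken from the incoming tensor (the channel count below is invented; 300x300 matches the test input at the bottom of this commit).

```cpp
// Illustrative values only, not part of the patch.
DDim tensor_dim = cl_image->dims();                 // say {1, 32, 224, 224}
DDim new_dim = make_ddim({tensor_dim[0],            // batch    kept -> 1
                          tensor_dim[1],            // channels kept -> 32
                          input_tensor.dims()[2],   // new height    -> 300
                          input_tensor.dims()[3]}); // new width     -> 300
cl_image->Resize(new_dim);                          // adopt the new shape
cl_image->InitEmptyImage(context, command_queue, new_dim);  // reallocate storage
```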
template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
const std::string &var_name) {
auto *target_var = program_.scope->FindVar(var_name);
PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
var_name.c_str());
auto *target_tensor = target_var->template GetMutable<LoDTensor>();
DLOG << "config_.load_when_predict " << config_.load_when_predict;
DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
DLOG << "target_tensor->dims() " << target_tensor->dims();
DLOG << "input.dims() " << input.dims();
if (config_.load_when_predict) {
if (target_tensor->dims() != input.dims()) {
if (!target_tensor->IsInitialized()) {
DLOG << "SetInput ---- > resize1";
std::cout << "SetInput ---- > resize1" << std::endl;
target_tensor->Resize(input.dims());
target_tensor->mutable_data<float>();
}
InitNoPersistableMemory(*target_tensor);
}
} else {
DLOG << "SetInput ---- > resize2";
target_tensor->Resize(input.dims());
DLOG << "SetInput ---- > ShareDataWith";
}
target_tensor->ShareDataWith(input);
}
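To show what the GPU_CL SetInput specialization means for callers, a hedged usage sketch follows: the config and the first input shape are taken from the test program changed at the bottom of this commit, while the model-loading call and the second, larger shape are assumptions. With load_when_predict enabled, a prediction whose input shape differs from the current feed tensor resizes the feed tensor and re-initialises every non-persistable CL image before running.

```cpp
// Sketch under the assumptions above; not part of the patch itself.
paddle_mobile::PaddleMobileConfigInternal config;
config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
// paddle_mobile.Load(...) is assumed to have been called here (omitted).

std::vector<float> input;                          // image data, filled elsewhere
std::vector<int64_t> dims{1, 1, 300, 300};
auto out1 = paddle_mobile.Predict(input, dims);    // first shape

std::vector<int64_t> bigger{1, 1, 600, 600};       // hypothetical new shape
auto out2 = paddle_mobile.Predict(input, bigger);  // triggers the resize path
```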
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
char **data) {}
@@ -639,6 +704,8 @@ void Executor<GPU_CL, float>::InitMemory() {
template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
DLOG << "CL InitCombineMemory---- "
<< "config_.load_when_predict: " << config_.load_when_predict;
char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) {
......
@@ -32,9 +32,8 @@ namespace framework {
template <typename Device, typename T = float>
class Executor {
public:
Executor(const Program<Device> &program, paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
const bool use_optimize = true, const bool lod_mode = false);
Executor(const Program<Device> &program, int batch_size = 1,
Executor(const Program<Device> &program,
paddle_mobile::PaddleMobileConfigInternal config, int batch_size = 1,
const bool use_optimize = true, const bool lod_mode = false);
PMStatus Predict(const std::vector<std::pair<std::string, Tensor>> &inputs);
@@ -62,8 +61,6 @@ class Executor {
protected:
Executor() = default;
bool varInputMemory(const std::shared_ptr<VarDesc> &var_desc, Variable *var,
LoDTensor *tensor) const;
void InitMemory();
@@ -89,7 +86,6 @@ class Executor {
// for super resoltion
DDim input_dim_;
#ifdef PADDLE_MOBILE_PROFILE
struct ProfInfo {
int tid = 0;
......
@@ -25,7 +25,6 @@ namespace framework {
template <typename Device = CPU, typename T = float>
class Loader {
public:
/*
* @b load separate format fluid model
* @b 加载分开存储的fluid模型
@@ -60,7 +59,6 @@ class Loader {
void InitMemoryFromProgram(
const std::shared_ptr<ProgramDesc> &originProgramDesc,
const std::shared_ptr<Scope> &scope);
};
} // namespace framework
......
@@ -42,8 +42,8 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &dirname,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->Load(dirname, optimize, quantification), config_, batch_size, optimize,
loddable);
loader_->Load(dirname, optimize, quantification), config_, batch_size,
optimize, loddable);
} else {
LOG(kLOG_INFO) << "executor inited";
}
@@ -64,7 +64,8 @@ PMStatus PaddleMobile<Device, T>::Load(const std::string &model_path,
if (executor_.get() == nullptr) {
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->Load(model_path, para_path, optimize, quantification), config_, batch_size, optimize, loddable);
loader_->Load(model_path, para_path, optimize, quantification), config_,
batch_size, optimize, loddable);
} else {
LOG(kLOG_INFO) << "executor inited";
}
@@ -86,8 +87,8 @@ bool PaddleMobile<Device, T>::LoadCombinedMemory(
executor_ = std::make_shared<framework::Executor<Device, T>>(
loader_->LoadCombinedMemory(model_len, model_buf, combined_params_len,
combined_params_buf, optimize,
quantification), config_,
batch_size, optimize, loddable);
quantification),
config_, batch_size, optimize, loddable);
} else {
LOG(kLOG_INFO) << "executor inited";
}
......
@@ -33,12 +33,10 @@ limitations under the License. */
namespace paddle_mobile {
template <typename Device, typename T = float>
class PaddleMobile {
public:
PaddleMobile(PaddleMobileConfigInternal config): config_(config){
PaddleMobile(PaddleMobileConfigInternal config) : config_(config) {
#ifndef PADDLE_MOBILE_CL
bool is_gpu = std::is_same<DeviceType<kGPU_CL>, Device>::value;
PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on");
......
@@ -1190,8 +1190,7 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
int lda, const float *B, int ldb, float beta, float *C,
int ldc, bool relu) {
}
int ldc, bool relu) {}
#else
......
@@ -27,7 +27,7 @@ int main() {
// auto program = loader.Load(g_super, true);
auto program = loader.Load(std::string(g_super) + "/model",
std::string(g_super) + "/params", false);
std::string(g_super) + "/params", false);
// program.originProgram->Description("program desc: ");
return 0;
......
@@ -21,7 +21,7 @@ int main() {
paddle_mobile::PaddleMobileConfigInternal config;
config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
// paddle_mobile.SetThreadNum(4);
auto time1 = paddle_mobile::time();
#ifdef PADDLE_MOBILE_CL
@@ -40,21 +40,26 @@ int main() {
std::vector<float> input;
std::vector<int64_t> dims{1, 1, 300, 300};
GetInput<float>(g_yolo_img, &input, dims);
GetInput<float>(g_test_image_1x3x224x224, &input, dims);
std::vector<float> vec_result;
auto time3 = paddle_mobile::time();
int max = 1;
int max = 10;
for (int i = 0; i < max; ++i) {
auto time5 = paddle_mobile::time();
vec_result = paddle_mobile.Predict(input, dims);
auto time6 = paddle_mobile::time();
std::cout << "predict cost :第" << i << ": "
<< paddle_mobile::time_diff(time5, time6) << "ms" << std::endl;
}
auto time4 = paddle_mobile::time();
std::cout << "predict cost :"
<< paddle_mobile::time_diff(time3, time4) / max << "ms"
<< std::endl;
std::vector<float>::iterator biggest =
auto biggest =
std::max_element(std::begin(vec_result), std::end(vec_result));
std::cout << " Max element is " << *biggest << " at position "
<< std::distance(std::begin(vec_result), biggest) << std::endl;
......