/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();

      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // Infer shapes once here to resize inputs and outputs before
      // prediction; in lod mode shapes must still be inferred at runtime.
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

#ifdef PADDLE_MOBILE_FPGA
  program_.scope->EraseVars({"feed", "fetch"});
  program_.scope->print_vars();
#endif

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}

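// Copies one tensor's raw weight bytes from the in-memory model stream into
// `tensor`, advancing *data past the bytes consumed. With quant_uint8 set,
// the stream holds a float [min, max] pair followed by uint8 values that are
// dequantized back to T on the fly.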
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

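// Parses one serialized variable from the stream: a version word, the LoD
// level and its rows, a tensor version, a (skipped) tensor description blob,
// and finally the raw data, which is dispatched to LoadMemInternal by the
// variable's declared data type.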
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

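// Separate-files model format: every persistable variable is loaded from its
// own file under model_path; non-persistable LoD tensors only get memory
// allocated for their declared type.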
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

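// Combined model format: all parameters sit in one buffer, either supplied
// by the caller (combined_params_buf) or read from para_path.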
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

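// Resizes every intermediate (non-persistable) tensor to match a new input's
// height/width and reallocates its memory; used when input dims change
// between predictions.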
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

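// Allocates backing memory for an input variable according to its declared
// data type; on FPGA builds the tensor is only type-initialized.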
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

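// Feed-and-run convenience overloads: bind each named input, then run the
// whole program.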
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

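// Flat-vector convenience overload: wraps `input` in a "feed" tensor of the
// given dims, runs the network, and copies the "fetch" result into a vector.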
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

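// Binds `input` to the scope variable `var_name`. With load_when_predict
// enabled, a change of input shape triggers reallocation of the intermediate
// tensors via InitNoPersistableMemory.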
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

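// Runs every op of every block in order. In lod mode shapes are re-inferred
// per run; with PADDLE_MOBILE_PROFILE defined, per-op wall time is recorded
// and a summary table is printed at the end.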
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

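// Looks up the named output variable (usually "fetch") and returns it
// wrapped in a freshly constructed shared LoDTensor.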
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

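// FPGA-specific helpers: inject raw tensors into the scope, feed/fetch data
// buffers, and run subranges of the op list.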
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed0");
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
  auto input_size = v.size();
  auto vars = program_.scope->VarContain("feed");
  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
                        "input data number not correct");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->external_data = v[i];
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  auto vars = program_.scope->VarContain("fetch");
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output data number not correct");
  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor->template data<float>();
  }
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

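// GPU_CL specializations: parameters live in CLImage objects rather than
// plain tensors, so memory setup and input binding differ from the CPU path.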
#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}
template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  auto &dim = input.dims();
  input_dim_last_ = static_cast<DDim>(dim);
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

652
  const TensorDesc &desc = var_desc.Tensor_desc();
653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
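    // Copy float weights one by one, flushing magnitudes below 1e-30 to
    // exactly zero to avoid denormal arithmetic later.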
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

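// CL InitMemory: persistable variables are read from per-variable files,
// parsed/dequantized into a float buffer, then attached to their CLImage;
// non-persistable LoD tensors get empty images sized from current dims.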
template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // image not initialized yet: attach the loaded tensor data
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  // keep a pointer to the start of the buffer; LoadMemory advances
  // origin_data as it parses
  char *data = origin_data;

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // image not initialized yet: attach the loaded tensor data
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile