/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();
      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // infer shapes to reshape inputs and outputs before prediction;
      // in lod mode, shapes still need to be inferred at runtime
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
#ifdef PADDLE_MOBILE_FPGA
  program_.scope->EraseVars({"feed", "fetch"});
  program_.scope->print_vars();
#endif

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}
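
// A minimal usage sketch (illustrative only: the config value and the input
// shape are assumptions, and `program` comes from the model loader):
//
//   Executor<CPU, float> executor(program, PaddleMobileConfigInternal(),
//                                 /*batch_size=*/1, /*use_optimize=*/true,
//                                 /*lod_mode=*/false);
//   std::vector<float> input(1 * 3 * 224 * 224, 0.f);
//   std::vector<float> output = executor.Predict(input, {1, 3, 224, 224});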

template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // dequantization; should be moved into the operator's Init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}
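
// On-disk layout of one persistable variable, as parsed by LoadMemory below:
//   uint32_t  version
//   uint64_t  lod level count; per level: a uint64_t byte size, then the data
//   uint32_t  tensor version
//   int32_t   serialized tensor desc size, then the desc itself (skipped
//             here; dims are taken from the VarDesc instead)
//   raw tensor data; when program_.quantification is set, float data is
//   stored as float min, float max and numel() uint8_t values, dequantized
//   as q * (max - min) / 255 + min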
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}
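
// InitCombineMemory (below) mirrors InitMemory but reads every parameter
// from a single combined buffer or file instead of one file per variable.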

template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory non-persistable lod tensor: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory non-persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finished";
}
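
// Re-initializes non-persistable tensors when the input shape changes between
// predictions (used with config_.load_when_predict; see SetInput): batch and
// channel dims are kept, spatial dims are taken from the new input.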

template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      // match the Tensor overload above: take the new size from the input
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
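  // Aggregate per-op runtime by op type (conv ops are also keyed by kernel
  // size) and print a table sorted by cost, with each entry's share of the
  // total time.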
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed0");
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
  auto input_size = v.size();
  auto vars = program_.scope->VarContain("feed");
  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
                        "input count does not match feed variables");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->external_data = v[i];
  }
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const vector<framework::Tensor> &v) {
  auto input_size = v.size();
  auto vars = program_.scope->VarContain("feed");
  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
                        "input count does not match feed variables");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->ShareDataWith(v[i]);
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  auto vars = program_.scope->VarContain("fetch");
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output count does not match fetch variables");
  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor->template data<float>();
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<framework::Tensor *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  auto vars = program_.scope->VarContain("fetch");
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output count does not match fetch variables");
  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor;
  }
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}
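
// Hypothetical usage, e.g. when debugging the pipeline op by op:
//   executor.Predict_From_To(0, 5);               // run only the first 5 ops
//   auto intermediate = executor.FetchResult(4);  // output of the fifth op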
template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}

template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  input_dim_last_ = input.dims();
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  // lod data is read only to advance the read pointer; it is not kept here
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = tensorInput;
  const int type_size = sizeof(float);
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      // flush near-zero values to exactly zero
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);
        DDim ddim = make_ddim(desc.Dims());

        // the CLImage has not been initialized yet; attach the tensor data
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();
          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use external memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  // keep the start of the buffer: LoadMemory advances origin_data below
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // the CLImage has not been initialized yet; attach the tensor data
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;
template class Executor<FPGA, float>;
template class Executor<GPU_CL, float>;
template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile