/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"

#include <algorithm>
#include <utility>
#include <vector>

#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();

      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // Infer shapes once here to reshape inputs and outputs before predict;
      // in lod mode, shapes still need to be inferred at runtime.
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
#ifdef PADDLE_MOBILE_FPGA
  TalorFeedOp();
  DLOG << "TalorFeed finished";
  TalorFetchdOp();
  DLOG << "TalorFetch finished";
  program_.scope->print_vars();
#endif
}

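// Copies one tensor's raw payload from the in-memory model buffer into
// `tensor`, advancing *data past the bytes consumed. When quant_uint8 is
// set, the payload is a [min, max] float pair followed by uint8 values,
// dequantized as value * (max - min) / 255 + min.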
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // dequantization; should eventually be moved into the operator's
    // init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

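// Parses one serialized variable record. The layout, as consumed below:
//   uint32  version
//   uint64  lod_level, then lod_level runs of {uint64 byte_size, size_t[]}
//   uint32  tensor version
//   int32   tensor desc size, followed by the desc bytes (skipped here;
//           dims are taken from var_desc instead)
//   raw tensor data, handled by LoadMemInternal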
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

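// Loads parameters from a single combined file (or a caller-provided
// buffer) instead of one file per variable. Persistable variables are
// read sequentially from the buffer; non-persistable LoD tensors only
// get their element buffers allocated.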
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr,
                        "origin_data should not be nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

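// Re-sizes every non-persistable LoD tensor when the input shape changes
// (used with config_.load_when_predict). This appears to assume NCHW
// activations whose two trailing dims track the input's height and width.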
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

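// Allocates the element buffer for an input (non-persistable) variable
// according to its declared data type. On FPGA builds the tensor is
// simply initialized as float.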
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

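// Convenience overload: wraps a flat buffer as the "feed" tensor and
// copies the "fetch" result back out. Illustrative call (a sketch; the
// shape and template arguments are hypothetical):
//   Executor<CPU, float> exe(program, config, 1, true, false);
//   std::vector<float> in(1 * 3 * 224 * 224);
//   std::vector<float> out = exe.Predict(in, {1, 3, 224, 224});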
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

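// Runs all ops block by block. In lod mode, shapes are re-inferred before
// each op. With PADDLE_MOBILE_PROFILE defined, per-op wall time is
// aggregated by op type (conv ops are further split by kernel size) and
// printed as a table sorted by total cost.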
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

#ifdef PADDLE_MOBILE_FPGA
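// The two "Talor*" helpers below (sic; presumably "tailor") rewrite the
// graph for FPGA: each feed/fetch op gets its own numbered variable
// (feed0, feed1, ...) sized to match the op's real input/output, and the
// op's name map is redirected to it.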
template <typename Device, typename T>
void Executor<Device, T>::TalorFeedOp() {
  auto &ops = ops_of_block_[0];
  int num = 0;
  program_.scope->EraseVars(std::vector<string>{string("feed")});
  for (auto op : ops) {
    if (op->Type() == "feed") {
      auto new_name = string("feed") + std::to_string(num++);
      auto var = program_.scope->Var(new_name);
      auto tensor = var->template GetMutable<LoDTensor>();
      auto output_map = op->Outputs();
      std::vector<std::string> out_keys = op->GetOutKeys();
      PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
      auto output_tensor =
          GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
      tensor->Resize(output_tensor->dims());
      tensor->init(typeid(float));
      op->ChangeNameMap("X", std::vector<string>{new_name});
    }
  }
}

template <typename Device, typename T>
void Executor<Device, T>::TalorFetchdOp() {
  auto &ops = ops_of_block_[0];
  int num = 0;
  program_.scope->EraseVars(std::vector<string>{string("fetch")});
  for (auto op : ops) {
    if (op->Type() == "fetch") {
      auto new_name = string("fetch") + std::to_string(num++);
      auto var = program_.scope->Var(new_name);
      auto tensor = var->template GetMutable<LoDTensor>();
      auto input_map = op->Inputs();
      std::vector<std::string> in_keys = op->GetInputKeys();
      PADDLE_MOBILE_ENFORCE(!in_keys.empty(), "this op contains no input");
      auto input_tensor =
          GetVarValue<LoDTensor>(in_keys[0], input_map, *(program_.scope));
      tensor->Resize(input_tensor->dims());
      tensor->init(typeid(float));
      op->ChangeNameMap("Out", std::vector<string>{new_name});
    }
  }
}

template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<Tensor> &v) {
  auto input_size = v.size();
  PADDLE_MOBILE_ENFORCE(input_size > 0, "Empty input");
  int counter = 0;
  auto vars = program_.scope->VarContain("feed");
  for (auto var : vars) {
    Tensor *feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->Resize(v[counter].dims());
    feed_tensor->ShareDataWith(v[counter]);
    // stop once every input is bound; >= avoids reading past v's end when
    // the scope contains more feed variables than inputs
    if (++counter >= static_cast<int>(v.size())) {
      return;
    }
  }
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

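// Partial execution over [start, end) of block 0's ops, mainly useful for
// debugging individual layers; Predict_From and Predict_To are thin
// wrappers over it.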
template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}

template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  input_dim_last_ = input.dims();
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

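// GPU_CL specialization: parameters are first decoded into a host-side
// float buffer (tensorInput); the caller then uploads it via
// CLImage::SetTensorData. Quantized uint8 payloads are dequantized the
// same way as on CPU.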
template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 2. lod information (read and skipped; the GPU path does not keep it)
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc (skipped; dims come from var_desc instead)
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);
  (*data) += sizeof(char) * size;

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  // tensorInput holds floats, whether the stored data is quantized or not
  void *memory = tensorInput;
  const int type_size = sizeof(float);
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      // flush denormals to zero
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // has not been initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  // keep the start of the buffer; LoadMemory advances origin_data as it reads
  char *data_start = origin_data;

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not been initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data_start;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile