/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();

      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // infer shapes to resize inputs and outputs before predict;
      // in lod mode, shapes still need to be inferred at runtime
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

#ifdef PADDLE_MOBILE_FPGA
  program_.scope->EraseVars({"feed", "fetch"});
  program_.scope->print_vars();
#endif

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}

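// Helper that copies one tensor's raw bytes out of an in-memory parameter
// buffer, advancing *data past what was consumed. With quant_uint8 set,
// weights were stored as uint8 preceded by a per-tensor [min, max] header
// and are dequantized as: value = uint8 * (max - min) / 255 + min.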
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

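// LoadMemory walks one serialized variable: a uint32 version, a uint64 LoD
// level count with one size-prefixed LoD vector per level, a uint32 tensor
// version, a size-prefixed tensor desc (skipped here; dims come from the
// VarDesc instead), and finally the raw tensor data.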
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

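// Non-combined models keep one file per parameter under model_path;
// persistable variables are read from disk while non-persistable LoDTensor
// variables only get memory allocated via varInputMemory.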
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

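// Combined models ship all parameters in a single buffer, either passed in
// by the caller (combined_params_buf) or read from para_path; variables are
// consumed from it sequentially in block/var order.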
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

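// When the input shape changes between predictions (load_when_predict),
// every intermediate tensor is resized to track the new spatial dims
// (H and W come from the input; batch and channel dims are kept).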
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

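// Allocates host memory for an input variable according to its declared
// element type; FPGA builds just tag the tensor as float and return.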
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

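// Convenience Predict overloads: copy user inputs into "feed" variables,
// run the whole program, and (for the std::vector<T> variant) copy the
// "fetch" result back out.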
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

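// SetInput shares the caller's buffer with the target variable. In
// load_when_predict mode a dims change triggers InitNoPersistableMemory so
// that intermediate buffers match the new input shape.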
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

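// Runs all ops of all blocks in order. With PADDLE_MOBILE_PROFILE defined,
// per-op wall time is measured with clock_gettime and reported per op type
// (conv ops are further keyed by kernel size).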
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

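// FPGA-only helpers: inject/fetch data directly through scope variables and
// run an arbitrary [start, end) slice of the op list, so parts of the graph
// can be interleaved with work done on the accelerator.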
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->template GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed0");
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const std::vector<void *> &v) {
  auto input_size = v.size();
  auto vars = program_.scope->VarContain("feed");
  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
                        "input data number not correct");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->external_data = v[i];
  }
}

template <typename Device, typename T>
void Executor<Device, T>::FeedTensorData(const vector<framework::Tensor> &v) {
  auto input_size = v.size();
  auto vars = program_.scope->VarContain("feed");
  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
                        "input data number not correct");
  for (int i = 0; i < input_size; i++) {
    auto var = program_.scope->Var("feed", i);
    auto feed_tensor = var->template GetMutable<LoDTensor>();
    feed_tensor->ShareDataWith(v[i]);
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetResults(std::vector<void *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  auto vars = program_.scope->VarContain("fetch");
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output data number not correct");
  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor->template data<float>();
  }
}

template <typename Device, typename T>
void Executor<Device, T>::GetTensorResults(
    std::vector<framework::Tensor *> *v) {
  auto output_size = v->size();
  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
  auto vars = program_.scope->VarContain("fetch");
  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
                        "output data number not correct");
  for (int i = 0; i < output_size; i++) {
    auto var = program_.scope->Var("fetch", i);
    auto fetch_tensor = var->template GetMutable<LoDTensor>();
    (*v)[i] = fetch_tensor;
  }
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

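// GPU_CL specializations: parameters are decoded into host float buffers
// and handed to CLImage objects for upload; non-persistable variables are
// backed by empty OpenCL images of the appropriate shape.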
#ifdef PADDLE_MOBILE_CL
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}

template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  auto &dim = input.dims();
  input_dim_last_ = static_cast<DDim>(dim);
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

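// GPU_CL LoadMemory parses the same serialized variable layout as the
// generic version above, but decodes directly into a host float buffer,
// dequantizing uint8 weights when needed and flushing values in the
// denormal range (|v| < 1e-30) to zero.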
template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2. lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // has not been uploaded to the device yet
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data == nullptr!");
  float *data = reinterpret_cast<float *>(origin_data);

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not been uploaded to the device yet
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    // data still points at the start of the buffer; origin_data has been
    // advanced past the parameters by LoadMemory
    delete[] reinterpret_cast<char *>(data);
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;
template class Executor<FPGA, float>;
template class Executor<GPU_CL, float>;
template class Executor<GPU_MALI, float>;

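// Typical usage, as a rough sketch (the Loader type and its Load() call are
// assumptions from elsewhere in this codebase, not defined in this file):
//
//   framework::Loader<CPU, float> loader;
//   auto program = loader.Load(model_dir, /*optimize=*/true);
//   framework::Executor<CPU, float> executor(
//       program, PaddleMobileConfigInternal(), /*batch_size=*/1,
//       /*use_optimize=*/true, /*lod_mode=*/false);
//   executor.SetInput(input_tensor, "feed");
//   if (executor.Predict() == PMSuccess) {
//     auto output = executor.GetOutput("fetch");
//   }
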
}  // namespace framework
}  // namespace paddle_mobile