/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

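// Illustrative usage sketch (hypothetical names; assumes the caller has
// already loaded a Program<CPU> and prepared a config):
//   Executor<CPU, float> executor(program, config, 1, true, false);
//   executor.SetInput(input_tensor, "feed");
//   executor.Predict();
//   auto output = executor.GetOutput("fetch");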
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();
      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // infer shape to reshape inputs and outputs before predict,
      // but in lod mode shapes still need to be inferred at runtime
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}

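// Copy one tensor's raw data out of the parameter buffer. When quant_uint8
// is set, the weights are stored as uint8 preceded by a [min, max] float
// pair and are dequantized to T while loading.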
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

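// Parse one serialized variable. The layout consumed here is:
//   uint32  version
//   uint64  lod_level, followed by lod_level length-prefixed size_t arrays
//   uint32  tensor version
//   int32   tensor desc size, followed by the (skipped) tensor desc bytes
//   raw tensor data, handled by LoadMemInternal according to the data type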
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

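// Load each persistable variable from its own file under model_path;
// non-persistable LoD tensors only get their memory allocated.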
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

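// Same as InitMemory, but all parameters live in one combined buffer:
// either a user-provided memory region or a single file at para_path.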
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

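// Re-create the non-persistable (intermediate) tensors when the input
// resolution changes: batch and channel dims are kept, H/W are taken
// from the new input.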
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

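// Allocate memory for a non-persistable variable according to the data
// type recorded in its VarDesc.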
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

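// Run every op of every block in order. In lod mode shapes depend on the
// actual input, so InferShape is re-run before each op.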
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
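    // Report convolutions bucketed by kernel size (e.g. "conv2d_3"), so
    // 1x1 and 3x3 convolutions show up as separate lines in the profile.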
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

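// FPGA-only helpers: feed data directly into the scope, run a sub-range of
// ops, and fetch the output of an individual op for debugging.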
#ifdef PADDLE_MOBILE_FPGA
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
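// OpenCL specializations: parameters are staged through host float buffers
// and uploaded into CLImage objects instead of plain LoDTensors.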
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}

template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  input_dim_last_ = input.dims();
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

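// GPU_CL variant of LoadMemory: decode one variable into a host float
// buffer (tensorInput) that is later handed to a CLImage, flushing
// denormal values to zero on the way.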
template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2. LoD information
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data, sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

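  // The LoD section is consumed only to advance the read pointer; its
  // contents are not used by the GPU path.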
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // stage the host tensor data; the CLImage itself is not initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  // keep the buffer start around; LoadMemory advances origin_data as it reads
  char *data = origin_data;

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // stage the host tensor data; the CLImage itself is not initialized yet
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile