/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode),
      config_(config) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();

      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // infer shape to reshape inputs and outputs before predict;
      // in lod mode, shapes still have to be inferred at runtime
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}

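// Copy one tensor's payload out of an already-positioned parameter buffer.
// With `quant_uint8`, the payload is two floats (min, max) followed by numel
// uint8 values, which are dequantized as value * (max - min) / 255 + min.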
template <typename T>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  T *tensor_data = tensor->mutable_data<T>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(T));
    *data_buf += size * sizeof(T);
  }
}

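// Parse one serialized variable:
//   [u32 version][u64 lod_level][lod vectors...][u32 tensor version]
//   [i32 desc size][tensor desc (skipped; dims come from var_desc)][raw data]
// *data is advanced past everything consumed.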
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

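// Separate-files model: each persistable variable is read from its own file,
// named after the variable, under program_.model_path.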
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

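// Combined model: all parameters live in a single blob, either supplied by
// the caller (combined_params_buf) or read from para_path (self-allocated).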
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

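// Reshape every non-persistable (activation) tensor when the input size
// changes: keep N and C, take H and W from the new input, then re-allocate.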
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(const Tensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

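// Allocate storage for an input variable according to its declared data type.
// On FPGA builds the tensor is only tagged as float here.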
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
#ifdef PADDLE_MOBILE_FPGA
  tensor->init(typeid(float));
  return true;
#endif
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

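// Convenience overloads: bind each named input, then run the whole program.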
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

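// Bind user data to a feed variable. With load_when_predict, a change in
// input dims triggers re-allocation of the activation tensors.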
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      InitNoPersistableMemory(input);  // use the new input dims, matching the Tensor overload
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

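// Run every op of every block in order. In lod mode, InferShape is re-run
// before each op because shapes depend on the current input.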
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}

#ifdef PADDLE_MOBILE_FPGA
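// FPGA-only helpers: inject data directly into scope variables, fetch an
// op's first output, and run arbitrary sub-ranges of the op list.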
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
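// OpenCL specializations: parameters are decoded into host float buffers,
// then uploaded into CLImage objects.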
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const Tensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          DDim tensor_dim = cl_image->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          cl_image->Resize(new_dim);
          cl_image->InitEmptyImage(context, command_queue, new_dim);
        }
      }
    }
  }
  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<float>();
}
template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;
  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      DLOG << "SetInput ---- > resize1";
      target_tensor->Resize(input.dims());
      target_tensor->mutable_data<float>();
      InitNoPersistableMemory(*target_tensor);
    }
  } else {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  }
  target_tensor->ShareDataWith(input);
  input_dim_last_ = input.dims();
}

template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
                                     char **data) {}

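// CL variant of LoadMemory: walks the same serialized layout, decoding the
// weights into the float staging buffer `tensorInput` for later upload.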
template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t lod_level = 0;
  memcpy(&lod_level, (*data), sizeof(uint64_t));
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  // the serialized tensor desc itself is not needed here; skip over it
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = tensorInput;
  int type_size = sizeof(float);  // weights are decoded as FP32
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
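    // copy float weights one at a time, flushing denormals (|v| < 1e-30) to zero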
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // the CLImage has no device image yet; stage host data for upload
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  char *data = origin_data;  // LoadMemory advances origin_data; keep the start for delete[]

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // the CLImage has no device image yet; stage host data for upload
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    delete[] data;
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile