executor.cpp 26.0 KB
Newer Older
W
wangliu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include "framework/executor.h"
D
dolphin8 已提交
16
#include <algorithm>
17
#include <utility>
W
wangliu 已提交
18
#include <vector>
L
liuruilong 已提交
19
#include "common/enforce.h"
L
liuruilong 已提交
20
#include "common/log.h"
L
liuruilong 已提交
21
#include "framework/framework.pb-c.h"
L
liuruilong 已提交
22 23
#include "framework/lod_tensor.h"
#include "framework/operator.h"
L
liuruilong 已提交
24
#include "framework/program/program-optimize/program_optimize.h"
L
liuruilong 已提交
25 26 27 28
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
Z
zhangyang 已提交
29
#include "memory/t_malloc.h"
L
update  
liuruilong 已提交
30 31 32 33

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif
W
wangliu 已提交
34 35

namespace paddle_mobile {
36
namespace framework {
37

W
wangliu 已提交
38 39
#pragma mark - executor

L
liuruilong 已提交
40
template <typename Device, typename T>
xiebaiyuan's avatar
xiebaiyuan 已提交
41 42 43 44
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
45
    : program_(program),
H
hjchen2 已提交
46 47
      batch_size_(batch_size),
      use_optimize_(use_optimize),
xiebaiyuan's avatar
xiebaiyuan 已提交
48 49
      lod_mode_(lod_mode),
      config_(config) {
50 51
  DLOG << "executor in lod mode: " << lod_mode_;

W
wangliu 已提交
52
  Variable *variable_ptr = program_.scope->Var("batch_size");
H
hjchen2 已提交
53
  variable_ptr->SetValue<int>(batch_size);
54 55

  program_desc_ =
Refine  
陈后江 已提交
56
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
57 58 59 60
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());
61

W
wangliu 已提交
62
  for (int i = 0; i < blocks.size(); ++i) {
63 64
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
W
wangliu 已提交
65
    for (int j = 0; j < ops.size(); ++j) {
66 67 68 69 70 71 72 73 74
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();
      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // infer shape to reshape inputs and outputs before predict,
      // but for lod mode, it still need to infer shape in runtime
      if (!lod_mode) {
        op_handler->InferShape();
xiebaiyuan's avatar
xiebaiyuan 已提交
75
      }
76
      ops_of_block_[i].push_back(op_handler);
W
wangliu 已提交
77 78
    }
  }
79

W
wangliu 已提交
80
  if (program_.combined) {
L
liuruilong 已提交
81 82 83 84
    InitCombineMemory();
  } else {
    InitMemory();
  }
85 86 87 88 89 90 91 92

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
L
liuruilong 已提交
93
  }
W
wangliu 已提交
94 95
}

96 97
template <typename Device>
static void LoadMemInternal(void **data, LoDTensor *tensor,
98
                            bool quant_uint8 = false) {
Refine  
陈后江 已提交
99
  char **data_buf = reinterpret_cast<char **>(data);
100
  int64_t size = tensor->numel();
101
  Device *tensor_data = tensor->mutable_data<Device>();
102 103
  if (quant_uint8) {
    // should be moved into operator init function
104 105
    float min_value;
    float max_value;
Z
zhangyang 已提交
106 107
    memory::Copy(&min_value, data_buf, sizeof(float));
    memory::Copy(&max_value, data_buf + sizeof(float), sizeof(float));
108 109
    data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
110
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
111 112
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
W
wangliu 已提交
113
    }
114 115
    data_buf += size * sizeof(uint8_t);
  } else {
116 117
    memory::Copy(tensor_data, *data_buf, size * sizeof(Device));
    *data_buf += size * sizeof(Device);
L
liuruilong 已提交
118
  }
119
}
W
wangliu 已提交
120

121 122 123 124
// Parses one serialized variable from the stream at `*data` into `tensor`
// and advances `*data` past everything consumed.
//
// Stream layout as read by this function:
//   uint32  version               (skipped)
//   uint64  lod_level
//   per lod level: uint64 byte size, then that many bytes of size_t entries
//   uint32  tensor version        (skipped)
//   int32   tensor desc size, then that many bytes of tensor desc (skipped)
//   tensor payload                (see LoadMemInternal)
//
// The tensor shape and data type are taken from `var_desc`, not from the
// serialized tensor desc. Header fields are read with memory::Copy because
// nothing guarantees they are naturally aligned inside the byte stream.
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version: value is not used, just step over it
  *data_buf += sizeof(uint32_t);

  // lod information
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = 0;
    memory::Copy(&size, *data_buf, sizeof(uint64_t));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    // Copy only what fits in tmp_dim; `size` may not be a multiple of
    // sizeof(size_t), and copying `size` bytes would overrun the vector.
    memory::Copy(tmp_dim.data(), *data_buf, tmp_dim.size() * sizeof(size_t));
    (*lod)[i] = std::move(tmp_dim);
    // The cursor still advances by the full recorded byte size.
    *data_buf += size;
  }

  // tensor version: value is not used, just step over it
  *data_buf += sizeof(uint32_t);

  // tensor desc size, followed by the tensor desc itself (skipped)
  int32_t tensor_desc_size = 0;
  memory::Copy(&tensor_desc_size, *data_buf, sizeof(int32_t));
  *data_buf += sizeof(int32_t);
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      // Only the FP32 path honours the program-level quantification flag.
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

173 174 175
// Initializes every variable of the program when parameters are stored as one
// file per variable under program_.model_path.
//
// - Persistable variables other than "feed"/"fetch" are read from
//   "<model_path>/<variable name>" and parsed with LoadMemory().
// - Non-persistable LoD-tensor variables only get their storage allocated via
//   varInputMemory().
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        // Same guard as InitCombineMemory(): fail with a message instead of
        // dereferencing a null buffer inside LoadMemory().
        PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
        // LoadMemory() advances the cursor, so keep the original pointer
        // around for releasing the buffer.
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

197 198
// Initializes every variable of the program when all parameters live in one
// combined blob.
//
// The blob is either the caller-provided buffer
// (program_.combined_params_buf / combined_params_len) or, when that is not
// set, the contents of program_.para_path. Only a buffer read here is
// released here. Persistable variables are parsed from the blob one after
// another in iteration order; non-persistable LoD tensors only get their
// storage allocated.
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  const bool has_external_buf =
      program_.combined_params_buf && program_.combined_params_len;
  const bool self_alloc = !has_external_buf;

  char *origin_data = nullptr;
  if (has_external_buf) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");

  // Moving read position; origin_data keeps the start for the final delete.
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();

      if (!var_desc->Persistable()) {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
        continue;
      }

      if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
        continue;
      }

      DLOG << " init combine memory persistable: " << var_desc->Name();

      LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
    }
  }

  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}
238

L
liuruilong 已提交
239
template <typename Device, typename T>
xiebaiyuan's avatar
xiebaiyuan 已提交
240
void Executor<Device, T>::InitNoPersistableMemory(
Z
zhaojiaying01 已提交
241
    const Tensor &input_tensor) {
L
liuruilong 已提交
242 243 244 245 246 247 248 249 250 251 252
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
xiebaiyuan's avatar
xiebaiyuan 已提交
253 254 255 256
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->Resize(new_dim);
L
liuruilong 已提交
257 258 259 260 261 262 263 264 265 266 267
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

268 269 270 271
// Allocates storage for `tensor` using the element type recorded in
// `var_desc`'s tensor description.
//
// Supported types: FP32, INT8, INT32, INT64. Any other type allocates
// nothing and trips PADDLE_MOBILE_ENFORCE.
//
// Returns true when the data type was one of the supported ones.
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  bool is_mute_match = true;
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      is_mute_match = false;
      break;
  }
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}
L
liuruilong 已提交
295

296 297 298 299 300
// Binds every (variable name, Tensor) pair in `inputs` with SetInput() and
// then runs the whole program. Returns the status of the parameterless
// Predict().
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (auto it = inputs.begin(); it != inputs.end(); ++it) {
    const std::string &var_name = it->first;
    const Tensor &value = it->second;
    SetInput(value, var_name);
  }
  return this->Predict();
}

// Binds every (variable name, LoDTensor) pair in `inputs` with SetInput()
// and then runs the whole program. Returns the status of the parameterless
// Predict().
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (auto it = inputs.begin(); it != inputs.end(); ++it) {
    const std::string &var_name = it->first;
    const LoDTensor &value = it->second;
    SetInput(value, var_name);
  }
  return this->Predict();
}
xiebaiyuan's avatar
xiebaiyuan 已提交
313

314 315 316 317 318 319 320 321 322 323 324 325 326 327
// Convenience entry point working on flat buffers: wraps `input` in a Tensor
// of shape `dims`, binds it to "feed", runs the program and copies the
// contents of "fetch" into the returned vector.
//
// Returns an empty vector when Predict() does not report PMSuccess.
template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {
  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");

  std::vector<T> output;
  if (this->Predict() != PMSuccess) {
    return output;
  }

  const auto fetched = GetOutput("fetch");
  output.resize(fetched->numel());
  const size_t byte_count = output.size() * sizeof(T);
  memcpy(output.data(), fetched->template data<T>(), byte_count);
  return output;
}
xiebaiyuan's avatar
xiebaiyuan 已提交
328

329 330 331 332 333 334
// Binds `input` to the scope variable `var_name`: the target tensor is
// resized to the input dims and shares the input's data.
//
// With config_.load_when_predict set, a change of input dims (compared with
// input_dim_last_) first triggers InitNoPersistableMemory(input).
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  const bool dims_changed =
      config_.load_when_predict && input_dim_last_ != input.dims();
  if (dims_changed) {
    InitNoPersistableMemory(input);
    input_dim_last_ = input.dims();
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}
xiebaiyuan's avatar
xiebaiyuan 已提交
348

349 350 351 352 353 354 355
// Binds `input` to the scope variable `var_name`: the target tensor is
// resized to the input dims, shares the input's data and takes over its LoD.
//
// With config_.load_when_predict set, a change of input dims (compared with
// input_dim_last_) first re-sizes the non-persistable tensors.
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (input_dim_last_ != input.dims()) {
      // Size the intermediates from the NEW input. The target tensor has not
      // been resized yet at this point, so using it here would propagate the
      // previous (or uninitialized) dims. This matches the Tensor overload.
      InitNoPersistableMemory(input);
      input_dim_last_ = input.dims();
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}
xiebaiyuan's avatar
xiebaiyuan 已提交
368

369 370
// Runs every operator of every block in order. In lod mode InferShape() is
// called before each Run(). Always returns PMSuccess.
//
// When built with PADDLE_MOBILE_PROFILE, the time spent in each operator is
// measured with CLOCK_MONOTONIC, accumulated per operator type (conv2d and
// depthwise_conv2d are additionally keyed by dimension 2 of their "Filter"
// input) and printed as a table sorted by descending cost, followed by a
// "total" row.
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec now;
  int op_index = 0;
#endif
  for (auto &block_ops : ops_of_block_) {
    for (auto &op : block_ops) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &now);
      profile[op_index].runBegin =
          static_cast<uint64_t>(now.tv_sec) * 1e9 + now.tv_nsec;
#endif
      if (lod_mode_) {
        op->InferShape();
      }
      op->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &now);
      profile[op_index].runEnd =
          static_cast<uint64_t>(now.tv_sec) * 1e9 + now.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  // Accumulate elapsed time per operator kind.
  std::unordered_map<std::string, uint64_t> time_by_kind;
  for (int i = 0; i < profile.size(); i++) {
    const uint64_t elapsed = profile[i].runEnd - profile[i].runBegin;
    std::string kind = ops_list_[i]->Type();
    if (kind == "conv2d" || kind == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      kind = kind + "_" + std::to_string(kernel_size);
    }
    time_by_kind[kind] += elapsed;
  }

  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> ranking(time_by_kind.begin(), time_by_kind.end());
  uint64_t total_cost = 0;
  for (const auto &entry : ranking) {
    total_cost += entry.second;
  }
  // Most expensive kind first.
  std::sort(ranking.begin(), ranking.end(),
            [](const prof_t &lhs, const prof_t &rhs) {
              return lhs.second > rhs.second;
            });
  ranking.push_back(std::make_pair("total", total_cost));
  for (const auto &entry : ranking) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", entry.first.c_str(),
           static_cast<float>(entry.second),
           static_cast<float>(entry.second) / total_cost * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

431 432 433 434 435 436 437 438
// Looks up the scope variable `var_name` and returns a newly allocated
// LoDTensor object copy-constructed from the variable's LoDTensor.
// The variable must exist (enforced).
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *found_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(found_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());
  auto *source_tensor = found_var->template GetMutable<LoDTensor>();
  std::shared_ptr<LoDTensor> result =
      std::make_shared<LoDTensor>(*source_tensor);
  return result;
}

441
#ifdef PADDLE_MOBILE_FPGA
442 443 444 445 446
// Gets the scope variable `var_name` through Scope::Var and makes its
// LoDTensor share the data of `t`, after resizing it to t's dims.
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *target_var = program_.scope->Var(var_name);
  Tensor *target_tensor = target_var->GetMutable<LoDTensor>();
  target_tensor->Resize(t.dims());
  target_tensor->ShareDataWith(t);
}
450

451 452
// Shorthand for injecting `t` into the variable named "feed".
template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  this->InjectVariable(t, "feed");
}
455

456 457
// Returns a copy (as Tensor) of the first output of an operator in block 0.
//
// id  index of the operator; a negative value selects the last operator.
//     Must be smaller than the number of operators (enforced).
// The selected operator must have at least one output key (enforced).
template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = (id < 0) ? ops[ops.size() - 1] : ops[id];

  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");

  const std::string &first_key = out_keys[0];
  auto *output_tensor =
      GetVarValue<LoDTensor>(first_key, output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}
469

470 471
// Runs the operators of block 0 with indices in [start, end).
//
// start  first operator to run; must be >= 0 and < end.
// end    one past the last operator to run; a negative value means "up to
//        and including the last operator".
// With PADDLE_MOBILE_PROFILE defined, begin/end timestamps of every executed
// operator are recorded in a local table.
template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  if (end < 0) {
    end = static_cast<int>(ops.size());
  }
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int op_idx = start; op_idx < end; ++op_idx) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);
    profile[op_idx].runBegin =
        static_cast<uint64_t>(now.tv_sec) * 1e9 + now.tv_nsec;
#endif
    DLOG << "Running op: " << op_idx << "  " << ops[op_idx]->Type();
    ops[op_idx]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &now);
    profile[op_idx].runEnd =
        static_cast<uint64_t>(now.tv_sec) * 1e9 + now.tv_nsec;
#endif
  }
}
495

496 497
// Runs block 0 starting at operator `start`; the end bound is left to
// Predict_From_To's default argument.
template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  this->Predict_From_To(start);
}
500

501 502
// Runs block 0 from its first operator up to (not including) operator `end`.
template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  const int first_op = 0;
  this->Predict_From_To(first_op, end);
}
505 506
#endif

Y
yangfei 已提交
507
#ifdef PADDLE_MOBILE_CL
xiebaiyuan's avatar
xiebaiyuan 已提交
508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
// OpenCL variant: re-creates the empty CL image of every non-persistable
// LoD-tensor variable so that its last two dimensions match dimensions 2 and
// 3 of `input_tensor` (dimensions 0 and 1 are kept). Afterwards the tensor
// returned by GetOutput("fetch") is resized to the input dims and allocated
// as float.
template <>
void Executor<GPU_CL, float>::InitNoPersistableMemory(
    const LoDTensor &input_tensor) {
  DLOG << "CL InitNoPersistableMemory ";
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());

      // Requested for every variable, before any filtering.
      auto cl_image = var->template GetMutable<CLImage>();

      if (var_desc->Persistable() ||
          var_desc->Type() != VARTYPE_TYPE_LOD_TENSOR) {
        continue;
      }

      cl_context context = program_.scope->GetCLScpoe()->Context();
      cl_command_queue command_queue =
          program_.scope->GetCLScpoe()->CommandQueue();

      DDim current_dim = cl_image->dims();
      DDim resized_dim =
          make_ddim({current_dim[0], current_dim[1], input_tensor.dims()[2],
                     input_tensor.dims()[3]});
      cl_image->Resize(resized_dim);
      cl_image->InitEmptyImage(context, command_queue, resized_dim);
    }
  }

  std::shared_ptr<LoDTensor> fetch_tensor = GetOutput("fetch");
  fetch_tensor->Resize(input_tensor.dims());
  fetch_tensor->mutable_data<float>();
}
// OpenCL variant of SetInput: binds `input` to the scope variable `var_name`.
//
// - load_when_predict set and dims changed since the last call: the target
//   is resized and allocated, then the non-persistable CL images are rebuilt
//   from it.
// - load_when_predict not set: the target is only resized.
// In every case the target finally shares the input's data and the input
// dims are remembered in input_dim_last_.
template <>
void Executor<GPU_CL, float>::SetInput(const Tensor &input,
                                       const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s is not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();
  DLOG << "config_.load_when_predict   " << config_.load_when_predict;
  DLOG << "target_tensor->IsInitialized() " << target_tensor->IsInitialized();
  DLOG << "target_tensor->dims()   " << target_tensor->dims();
  DLOG << "input.dims()   " << input.dims();
  DLOG << "input_dim_last_   " << input_dim_last_;

  if (!config_.load_when_predict) {
    DLOG << "SetInput ---- > resize2";
    target_tensor->Resize(input.dims());
    DLOG << "SetInput ---- > ShareDataWith";
  } else if (input_dim_last_ != input.dims()) {
    DLOG << "SetInput ---- > resize1";
    target_tensor->Resize(input.dims());
    target_tensor->mutable_data<float>();
    InitNoPersistableMemory(*target_tensor);
  }

  target_tensor->ShareDataWith(input);
  input_dim_last_ = static_cast<DDim>(input.dims());
}

572 573 574
// Primary-template definition of the raw-float loader. Its body is empty;
// this file provides an explicit specialization for Executor<GPU_CL, float>.
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc,
                                     float *tensorInput,
                                     char **data) {
}
L
liuruilong 已提交
575

Y
yangfei 已提交
576
template <>
H
hjchen2 已提交
577 578
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

616
  const TensorDesc &desc = var_desc.Tensor_desc();
617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}
651

Y
yangfei 已提交
652
template <>
653 654
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
Y
yangfei 已提交
655 656 657
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
L
liuruilong 已提交
658
        CLImage *cl_image = nullptr;
Y
yangfei 已提交
659
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
660
          var->template GetMutable<LoDTensor>();
Y
yangfei 已提交
661
          continue;
L
liuruilong 已提交
662
        } else {
663
          cl_image = var->template GetMutable<CLImage>();
Y
yangfei 已提交
664
        }
L
liuruilong 已提交
665

Y
yangfei 已提交
666
        char *origin_data =
L
liuruilong 已提交
667
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
668
        char *data = origin_data;
Y
yangfei 已提交
669
        cl_context context = program_.scope->GetCLScpoe()->Context();
670
        const TensorDesc &desc = var_desc->Tensor_desc();
671 672 673 674 675
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
Y
yangfei 已提交
676
        float *tensorInput = static_cast<float *>(
677 678
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);
Y
yangfei 已提交
679

680
        DDim ddim = make_ddim(desc.Dims());
Y
yangfei 已提交
681

L
liuruilong 已提交
682 683
        // has not init
        cl_image->SetTensorData(tensorInput, ddim);
Y
yangfei 已提交
684

685
        delete origin_data;
Y
yangfei 已提交
686
        paddle_mobile::memory::Free(tensorInput);
687
      } else {
688 689
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
690
          cl_context context = program_.scope->GetCLScpoe()->Context();
L
liuruilong 已提交
691 692
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();
Y
yangfei 已提交
693

694 695 696
          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
697
          DLOG << var_desc->Name();
L
liuruilong 已提交
698
          cl_image->InitEmptyImage(context, command_queue, ddim);
699
        }
Y
yangfei 已提交
700 701 702 703
      }
    }
  }
}
704

Y
yangfei 已提交
705
template <>
706
void Executor<GPU_CL, float>::InitCombineMemory() {
xiebaiyuan's avatar
xiebaiyuan 已提交
707 708
  DLOG << "CL InitCombineMemory---- "
       << "config_.load_when_predict: " << config_.load_when_predict;
Y
yangfei 已提交
709 710
  char *origin_data = nullptr;
  bool self_alloc = false;
Y
yangfei 已提交
711 712
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outter memory";
713
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
Y
yangfei 已提交
714 715
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
Y
yangfei 已提交
716
    self_alloc = true;
L
liuruilong 已提交
717
    origin_data = ReadFileToBuff(program_.para_path);
Y
yangfei 已提交
718 719
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
720
  float *data = reinterpret_cast<float *>(origin_data);
Y
yangfei 已提交
721

722
  for (const auto &block : program_desc_->Blocks()) {
Y
yangfei 已提交
723 724 725
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
L
liuruilong 已提交
726
        CLImage *cl_image = nullptr;
Y
yangfei 已提交
727
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
728
          var->template GetMutable<LoDTensor>();
Y
yangfei 已提交
729
          continue;
L
liuruilong 已提交
730
        } else {
731
          cl_image = var->template GetMutable<CLImage>();
Y
yangfei 已提交
732 733 734 735
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

736 737
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());
Y
yangfei 已提交
738 739 740 741 742

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
743 744 745
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);
L
liuruilong 已提交
746 747 748 749

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

750 751
        paddle_mobile::memory::Free(tensorInput);
      } else {
752
        auto cl_image = var->template GetMutable<CLImage>();
Y
yangfei 已提交
753
        cl_context context = program_.scope->GetCLScpoe()->Context();
L
liuruilong 已提交
754 755
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
756 757 758
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
L
liuruilong 已提交
759
        cl_image->InitEmptyImage(context, command_queue, ddim);
Y
yangfei 已提交
760 761 762
      }
    }
  }
Y
yangfei 已提交
763
  if (self_alloc) {
764
    delete data;
Y
yangfei 已提交
765
  }
Y
yangfei 已提交
766
  LOG(kLOG_INFO) << " end init combine memory ";
767
}
Y
yangfei 已提交
768 769 770

#endif

771
template class Executor<CPU, float>;
Y
yangfei 已提交
772

773
template class Executor<FPGA, float>;
W
wangliu 已提交
774

775
template class Executor<GPU_CL, float>;
Y
yangfei 已提交
776

777
template class Executor<GPU_MALI, float>;
Y
yangfei 已提交
778 779

}  // namespace framework
W
wangliu 已提交
780
}  // namespace paddle_mobile