/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "framework/executor.h"
#include <algorithm>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "memory/t_malloc.h"

#ifdef PADDLE_MOBILE_CL
#include "framework/cl/cl_image.h"
#endif

namespace paddle_mobile {
namespace framework {

#pragma mark - executor

template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program,
                              paddle_mobile::PaddleMobileConfigInternal config,
                              int batch_size, const bool use_optimize,
                              const bool lod_mode)
    : Executor(program, batch_size, use_optimize, lod_mode) {
  config_ = config;
}

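// Builds the executor from the chosen ProgramDesc (optimized or original):
// creates an operator for every OpDesc in every block, pre-infers shapes when
// not in lod mode, loads the parameters, and then runs each op's Init().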
template <typename Device, typename T>
Executor<Device, T>::Executor(const Program<Device> &program, int batch_size,
                              const bool use_optimize, const bool lod_mode)
    : program_(program),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      lod_mode_(lod_mode) {
  DLOG << "executor in lod mode: " << lod_mode_;

  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);

  program_desc_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(program_desc_ != nullptr,
                        "program_desc_ should not be nullptr");
  const auto &blocks = program_desc_->Blocks();
  ops_of_block_.resize(blocks.size());

  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<OpDesc> op_desc = ops[j];
      DLOG << "create op: " << op_desc->Type();
      auto op_handler = OpRegistry<Device>::CreateOp(
          op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(),
          op_desc->GetAttrMap(), program_.scope);
      // Infer shapes here to reshape inputs and outputs before prediction;
      // in lod mode shapes still need to be inferred at runtime.
      if (!lod_mode) {
        op_handler->InferShape();
      }
      ops_of_block_[i].push_back(op_handler);
    }
  }

  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }

  int count = 0;
  for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) {
    for (auto &op_handler : ops_of_block_[block_id]) {
      DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
      op_handler->Init();
      ops_list_.push_back(op_handler);
    }
  }
}

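// Copies one tensor's raw data out of the parameter stream and advances the
// stream pointer. With quant_uint8 set, the stream carries per-tensor min/max
// floats followed by uint8 values, dequantized as
// value = uint8 * (max - min) / 255 + min.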
template <typename Device>
static void LoadMemInternal(void **data, LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  Device *tensor_data = tensor->mutable_data<Device>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memory::Copy(&min_value, *data_buf, sizeof(float));
    memory::Copy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memory::Copy(tensor_data, *data_buf, size * sizeof(Device));
    *data_buf += size * sizeof(Device);
  }
}

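// Parses one serialized parameter. The layout consumed below is:
//   uint32 version | uint64 lod_level | lod_level x (uint64 size + lod bytes) |
//   uint32 tensor version | int32 tensor-desc size | tensor desc (skipped) |
//   tensor data (decoded by LoadMemInternal according to the declared type).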
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(void **data,
                                     const std::shared_ptr<VarDesc> var_desc,
                                     LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  uint64_t lod_level = 0;
  memory::Copy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memory::Copy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

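// Separate-files model format: every persistable variable except feed/fetch is
// read from "<model_path>/<var name>"; non-persistable LoD tensors only get
// their element type allocated via varInputMemory.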
template <typename Device, typename T>
void Executor<Device, T>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

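// Combined model format: all parameters sit in one buffer, either supplied by
// the caller (combined_params_buf) or read from para_path, and are consumed
// sequentially by LoadMemory.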
template <typename Device, typename T>
void Executor<Device, T>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }

        DLOG << " init combine memory persistable: " << var_desc->Name();

        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DLOG << " init combine memory no persistable in lod: "
               << var_desc->Name();
          varInputMemory(var_desc, var, tensor);
        } else {
          DLOG << " init combine memory no persistable: " << var_desc->Name();
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combine memory finish";
}

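// Re-allocates every non-persistable LoD tensor (and the fetch output) for a
// new input height/width, keeping the first two dimensions of the old shape.
// Called from SetInput when load_when_predict is set and the input shape
// changes.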
template <typename Device, typename T>
void Executor<Device, T>::InitNoPersistableMemory(
    const LoDTensor &input_tensor) {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          DDim tensor_dim = tensor->dims();
          DDim new_dim =
              make_ddim({tensor_dim[0], tensor_dim[1], input_tensor.dims()[2],
                         input_tensor.dims()[3]});
          tensor->template Resize(new_dim);
          tensor->template mutable_data<T>();
        }
      }
    }
  }

  std::shared_ptr<LoDTensor> output = GetOutput("fetch");
  output->Resize(input_tensor.dims());
  output->mutable_data<T>();
}

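// Allocates storage for a non-persistable (input) tensor according to its
// declared data type; unsupported types trip the enforce below.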
template <typename Device, typename T>
bool Executor<Device, T>::varInputMemory(
    const std::shared_ptr<VarDesc> &var_desc, Variable *var,
    LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match =
      (type == VARTYPE_TYPE_FP32) || (type == VARTYPE_TYPE_INT8) ||
      (type == VARTYPE_TYPE_INT32) || (type == VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

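// Convenience overloads: feed a set of named inputs, then run the whole
// program with the parameterless Predict().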
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, Tensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict(
    const std::vector<std::pair<std::string, LoDTensor>> &inputs) {
  for (const auto &input : inputs) {
    SetInput(input.second, input.first);
  }
  return this->Predict();
}

template <typename Device, typename T>
std::vector<T> Executor<Device, T>::Predict(const std::vector<T> &input,
                                            const std::vector<int64_t> &dims) {

  Tensor feed_tensor(input, make_ddim(dims));
  SetInput(feed_tensor, "feed");
  std::vector<T> output;
  if (this->Predict() == PMSuccess) {
    const auto output_tensor = GetOutput("fetch");
    output.resize(output_tensor->numel());
    memcpy(output.data(), output_tensor->template data<T>(),
           output.size() * sizeof(T));
  }
  return output;
}

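// Binds user data to a named variable (normally "feed"). With
// load_when_predict enabled, a shape change first triggers
// InitNoPersistableMemory to rebuild the intermediate tensors.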
template <typename Device, typename T>
void Executor<Device, T>::SetInput(const Tensor &input,
                                   const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());

  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (target_tensor->IsInitialized() &&
        target_tensor->dims() != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
}

template <typename Device, typename T>
void Executor<Device, T>::SetInput(const LoDTensor &input,
                                   const std::string &var_name) {

  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *target_tensor = target_var->template GetMutable<LoDTensor>();

  if (config_.load_when_predict) {
    if (target_tensor->IsInitialized() &&
        target_tensor->dims() != input.dims()) {
      InitNoPersistableMemory(*target_tensor);
    }
  }

  target_tensor->Resize(input.dims());
  target_tensor->ShareDataWith(input);
  target_tensor->set_lod(input.lod());
}

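// Runs every op of every block in order. In lod mode shapes are re-inferred on
// each run; with PADDLE_MOBILE_PROFILE defined, per-op wall time is collected
// and a summary grouped by op type (conv ops also by kernel size) is printed.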
template <typename Device, typename T>
PMStatus Executor<Device, T>::Predict() {
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops_list_.size());
  struct timespec ts;
  int op_index = 0;
#endif
  for (auto &block : ops_of_block_) {
    for (auto &op_handler : block) {
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
      if (lod_mode_) {
        op_handler->InferShape();
      }
      op_handler->Run();
#ifdef PADDLE_MOBILE_PROFILE
      clock_gettime(CLOCK_MONOTONIC, &ts);
      profile[op_index].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
      ++op_index;
#endif
    }
  }
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    if (ops_list_[i]->Type() == "conv2d" ||
        ops_list_[i]->Type() == "depthwise_conv2d") {
      auto inputs = ops_list_[i]->Inputs();
      auto *filter =
          GetVarValue<LoDTensor>("Filter", inputs, *(program_.scope));
      int kernel_size = filter->dims()[2];
      _tp[ops_list_[i]->Type() + "_" + std::to_string(kernel_size)] +=
          timeCost;
    } else {
      _tp[ops_list_[i]->Type()] += timeCost;
    }
  }
  printf("====================[ profile ]======================\n");
  typedef std::pair<std::string, uint64_t> prof_t;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return PMSuccess;
}

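// Returns a copy of the named output variable, typically "fetch".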
template <typename Device, typename T>
std::shared_ptr<LoDTensor> Executor<Device, T>::GetOutput(
    const std::string &var_name) {
  auto *target_var = program_.scope->FindVar(var_name);
  PADDLE_MOBILE_ENFORCE(target_var != nullptr, "Variable %s does not exist",
                        var_name.c_str());
  auto *output_tensor = target_var->template GetMutable<LoDTensor>();
  return std::make_shared<LoDTensor>(*output_tensor);
}
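// A minimal usage sketch (illustration only; `program` is assumed to come from
// the paddle_mobile model loader, and the input shape is just an example):
//
//   Executor<CPU, float> exe(program, /*batch_size=*/1,
//                            /*use_optimize=*/true, /*lod_mode=*/false);
//   std::vector<float> image(1 * 3 * 224 * 224);
//   std::vector<float> out = exe.Predict(image, {1, 3, 224, 224});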

#ifdef PADDLE_MOBILE_FPGA
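// FPGA-only helpers: inject a tensor into an arbitrary variable, fetch the
// first output of a given op, and run a sub-range of block 0's ops.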
template <typename Device, typename T>
void Executor<Device, T>::InjectVariable(const Tensor &t,
                                         std::string var_name) {
  Variable *g_feed_value = program_.scope->Var(var_name);
  Tensor *feed_tensor = g_feed_value->GetMutable<LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Device, typename T>
void Executor<Device, T>::FeedData(const Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Device, typename T>
std::shared_ptr<Tensor> Executor<Device, T>::FetchResult(int id) {
  auto &ops = ops_of_block_[0];

  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor =
      GetVarValue<LoDTensor>(out_keys[0], output_map, *(program_.scope));
  return std::make_shared<Tensor>(Tensor(*output_tensor));
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From_To(int start, int end) {
  auto &ops = ops_of_block_[0];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Device, typename T>
void Executor<Device, T>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

#ifdef PADDLE_MOBILE_CL
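// GPU_CL specializations: parameters are decoded to float on the host
// (dequantizing and flushing near-zero values to zero) and uploaded into
// CLImage objects; non-persistable variables get empty CL images sized from
// cl_image->dims().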
template <typename Device, typename T>
void Executor<Device, T>::LoadMemory(const VarDesc var_desc,
                                     float *tensorInput, char **data) {}

template <>
void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
                                         float *tensorInput, char **data) {
  // 1. version
  uint32_t version = *reinterpret_cast<uint32_t *>(*data);

  (*data) += sizeof(uint32_t);

  // 2 Lod information
  uint64_t *lod_level_ptr = new uint64_t();
  memcpy(lod_level_ptr, (*data), sizeof(uint64_t));
  uint64_t lod_level = *lod_level_ptr;
  delete lod_level_ptr;
  (*data) += sizeof(uint64_t);

  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *reinterpret_cast<uint64_t *>(*data);
    (*data) += sizeof(uint64_t);
    std::vector<size_t> tmp(size / sizeof(size_t));

    for (int k = 0; k < tmp.size(); ++k) {
      tmp[k] = *reinterpret_cast<size_t *>(*data);
      (*data) += sizeof(size_t);
    }
  }

  // 3. tensor version
  uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
  (*data) += sizeof(uint32_t);

  // 4. tensor desc
  int32_t size = *reinterpret_cast<int32_t *>(*data);
  (*data) += sizeof(int32_t);

  std::unique_ptr<char[]> buf(new char[size]);
  for (int m = 0; m < size; ++m) {
    buf.get()[m] = (*data)[m];
  }
  (*data) += (sizeof(char) * size);

  const TensorDesc &desc = var_desc.Tensor_desc();
  int memory_size = 1;
  for (auto l : desc.Dims()) {
    memory_size *= l;
  }

  void *memory = nullptr;
  int type_size = 4;
  memory = tensorInput;
  if (program_.quantification) {
    float min_value;
    float max_value;

    memcpy(&min_value, *data, sizeof(float));
    memcpy(&max_value, *data + sizeof(float), sizeof(float));
    *data += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
    for (int k = 0; k < memory_size; ++k) {
      static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
    }
    *data += (memory_size * sizeof(uint8_t));
  } else {
    for (int n = 0; n < memory_size; n++) {
      float value;
      memcpy(&value, *data + n * type_size, type_size);
      if (value < 1e-30 && value > -1e-30) {
        static_cast<float *>(memory)[n] = 0.0;
      } else {
        static_cast<float *>(memory)[n] = value;
      }
    }
    (*data) += (sizeof(char) * memory_size * type_size);
  }
}

template <>
void Executor<GPU_CL, float>::InitMemory() {
  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        cl_context context = program_.scope->GetCLScpoe()->Context();
        const TensorDesc &desc = var_desc->Tensor_desc();
        int numel = 1;
        for (auto l : desc.Dims()) {
          numel *= l;
        }
        DLOG << var_desc->Name();
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &data);

        DDim ddim = make_ddim(desc.Dims());

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        delete[] origin_data;
        paddle_mobile::memory::Free(tensorInput);
      } else {
        if (var_desc->Type() == VARTYPE_TYPE_LOD_TENSOR) {
          auto cl_image = var->template GetMutable<CLImage>();
          cl_context context = program_.scope->GetCLScpoe()->Context();
          cl_command_queue command_queue =
              program_.scope->GetCLScpoe()->CommandQueue();

          const TensorDesc &desc = var_desc->Tensor_desc();
          //          DDim ddim = make_ddim(desc.Dims());
          DDim ddim = cl_image->dims();
          DLOG << var_desc->Name();
          cl_image->InitEmptyImage(context, command_queue, ddim);
        }
      }
    }
  }
}

template <>
void Executor<GPU_CL, float>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    LOG(kLOG_INFO) << "use outer memory";
    origin_data = reinterpret_cast<char *>(program_.combined_params_buf);
  } else {
    LOG(kLOG_INFO) << " begin init combine memory";
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!");
  float *data = reinterpret_cast<float *>(origin_data);

  for (const auto &block : program_desc_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      if (var_desc->Persistable()) {
        CLImage *cl_image = nullptr;
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          var->template GetMutable<LoDTensor>();
          continue;
        } else {
          cl_image = var->template GetMutable<CLImage>();
        }

        cl_context context = program_.scope->GetCLScpoe()->Context();

        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = make_ddim(desc.Dims());

        int numel = 1;
        for (int i = 0; i < ddim.size(); i++) {
          numel = numel * ddim[i];
        }
        float *tensorInput = static_cast<float *>(
            paddle_mobile::memory::Alloc(sizeof(float) * numel));
        LoadMemory(*var_desc, tensorInput, &origin_data);

        // has not init
        cl_image->SetTensorData(tensorInput, ddim);

        paddle_mobile::memory::Free(tensorInput);
      } else {
        auto cl_image = var->template GetMutable<CLImage>();
        cl_context context = program_.scope->GetCLScpoe()->Context();
        cl_command_queue command_queue =
            program_.scope->GetCLScpoe()->CommandQueue();
        const TensorDesc &desc = var_desc->Tensor_desc();
        DDim ddim = cl_image->dims();
        //  DDim ddim = make_ddim(desc.Dims());
        cl_image->InitEmptyImage(context, command_queue, ddim);
      }
    }
  }
  if (self_alloc) {
    // `data` still points at the start of the self-allocated buffer
    // (LoadMemory advances origin_data), so free through it.
    delete[] reinterpret_cast<char *>(data);
  }
  LOG(kLOG_INFO) << " end init combine memory ";
}

#endif

template class Executor<CPU, float>;

template class Executor<FPGA, float>;

template class Executor<GPU_CL, float>;

template class Executor<GPU_MALI, float>;

}  // namespace framework
}  // namespace paddle_mobile