/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "io/executor.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include "common/enforce.h"
#include "common/log.h"
#include "framework/framework.pb-c.h"
#include "framework/lod_tensor.h"
#include "framework/operator.h"
#include "framework/program/program-optimize/program_optimize.h"
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
#include "operators/math/gemm.h"

namespace paddle_mobile {

using framework::Variable;

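// The constructor builds the whole pipeline up front: it instantiates every
// op of the (optionally optimized) program, infers static shapes unless the
// model is loddable, loads the persistable parameters, and finally calls
// Init() on each op of block 0.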
template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
                             const bool use_optimize, const bool loddable)
    : program_(p),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      loddable_(loddable) {
  Variable *variable_ptr = program_.scope->Var("batch_size");
  variable_ptr->SetValue<int>(batch_size);
  to_predict_program_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                        "to_predict_program_ == NULL!");
  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
      to_predict_program_->Blocks();

  DLOG << "executor in loddable mode: " << loddable_;
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
      DLOG << "create op: " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
      // infer shapes now so tensors are resized before prediction;
      // LoD tensors depend on runtime input and are reshaped in PredictLod
      if (!loddable_) {
        op_base->InferShape();
      }
      ops_of_block_[*block_desc.get()].push_back(op_base);
    }
  }
  if (program_.combined) {
    InitCombineMemory();
  } else {
    InitMemory();
  }
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
  for (const auto &op : ops) {
    op->Init();
  }
}

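// LoadMemInternal copies one tensor's raw data out of the parameter buffer
// and advances *data past what it consumed. With quant_uint8 set, the buffer
// holds min/max as two floats followed by numel() uint8 values, dequantized
// as value = uint8 * (max - min) / 255 + min (e.g. with min = 0, max = 2.55,
// the stored byte 100 decodes to 1.0).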
template <typename Dtype>
static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
                            bool quant_uint8 = false) {
  char **data_buf = reinterpret_cast<char **>(data);
  int64_t size = tensor->numel();
  Dtype *tensor_data = tensor->mutable_data<Dtype>();
  if (quant_uint8) {
    // should be moved into operator init function
    float min_value;
    float max_value;
    memcpy(&min_value, *data_buf, sizeof(float));
    memcpy(&max_value, *data_buf + sizeof(float), sizeof(float));
    *data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data_buf);
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
    }
    *data_buf += size * sizeof(uint8_t);
  } else {
    memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
    *data_buf += size * sizeof(Dtype);
  }
}

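// LoadMemory parses one serialized variable in place; the layout is
//   uint32 version | uint64 lod_level | per level: uint64 byte size + data
//   | uint32 tensor version | int32 desc size | desc bytes | tensor data
// and the trailing tensor data is handed off to LoadMemInternal.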
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(
    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
    framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  // read via memcpy instead of dereferencing a cast pointer, since *data_buf
  // is not guaranteed to be suitably aligned for uint64_t
  uint64_t lod_level = 0;
  memcpy(&lod_level, *data_buf, sizeof(uint64_t));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memcpy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc
  *data_buf += tensor_desc_size;

  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case framework::VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
                             program_.quantification);
      break;
    case framework::VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case framework::VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

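// InitMemory loads every persistable variable from its own file under
// program_.model_path (skipping the special "feed"/"fetch" variables) and
// pre-allocates storage for the non-persistable LoD tensors.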
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

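// InitCombineMemory is the counterpart of InitMemory for combined models:
// all parameters are read sequentially from a single buffer, either supplied
// in memory with the program or loaded from program_.para_path.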
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() {
  char *origin_data = nullptr;
  bool self_alloc = false;
  if (program_.combined_params_buf && program_.combined_params_len) {
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
  } else {
    self_alloc = true;
    origin_data = ReadFileToBuff(program_.para_path);
  }
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
  if (self_alloc) {
    delete[] origin_data;
  }
  LOG(kLOG_INFO) << "init combined memory finished";
}

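// varInputMemory pre-allocates mutable storage for a non-persistable LoD
// tensor according to its declared element type; anything other than
// FP32/INT8/INT32/INT64 trips the enforce below.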
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
    framework::LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  switch (type) {
    case framework::VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case framework::VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case framework::VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case framework::VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      break;
  }
  bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
                       (type == framework::VARTYPE_TYPE_INT8) ||
                       (type == framework::VARTYPE_TYPE_INT32) ||
                       (type == framework::VARTYPE_TYPE_INT64);
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}

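// Predict feeds the input tensor into the "feed" variable, runs every op of
// block 0 in order, and returns a copy of the first output of the last op.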
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

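  // when built with PADDLE_MOBILE_PROFILE, per-op wall time is measured with
  // CLOCK_MONOTONIC and aggregated by op type into the table printed below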
#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    // run the op
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}

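// PredictLod mirrors Predict but propagates the input's LoD into the feed
// tensor and, in loddable mode, re-runs InferShape before each op so shapes
// that depend on the runtime LoD stay correct.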
template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::LoDTensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  feed_tensor->set_lod(t.lod());

  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);

  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    if (loddable_) {
      ops[i]->InferShape();
    }
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  auto last_op = ops.rbegin();

  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  return std::make_shared<framework::LoDTensor>(
      framework::LoDTensor(*output_tensor));
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t, int block_id) {
  return Predict(t);
}

template <typename Dtype, Precision P>
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
  Executor<Dtype, P>::Ptype *output_ptr =
      output_tensor->data<typename Executor<Dtype, P>::Ptype>();
  std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
  for (int j = 0; j < output_tensor->numel(); ++j) {
    result_vector.push_back(output_ptr[j]);
  }
  return result_vector;
}

#ifdef PADDLE_MOBILE_FPGA
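// FPGA-only helpers: FeedData/InjectVariable write an input tensor directly
// into a named scope variable, FetchResult reads the output of op `id` (or of
// the last op when id < 0), and Predict_From_To runs only the ops in
// [start, end), letting a caller resume execution partway through the block.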
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
                                        std::string var_name) {
  framework::Variable *g_feed_value = program_.scope->Var(var_name);
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
  InjectVariable(t, "feed");
}

template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

  PADDLE_MOBILE_ENFORCE(id < static_cast<int>(ops.size()),
                        "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
  Predict_From_To(start);
}

template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
  Predict_From_To(0, end);
}
#endif

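// A minimal usage sketch (hypothetical caller code: the Loader type, model
// path, and input shape are assumptions, not part of this file):
//
//   paddle_mobile::Loader<CPU> loader;
//   auto program = loader.Load("./model_dir", /*optimize=*/true);
//   Executor<CPU, Precision::FP32> executor(program, 1, true);
//   std::vector<float> input(1 * 3 * 224 * 224, 0.f);
//   auto output = executor.Predict(input, {1, 3, 224, 224});
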
template class Executor<CPU, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;

}  // namespace paddle_mobile