executor.cpp 16.8 KB
Newer Older
W
wangliu 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

15
#include "io/executor.h"
D
dolphin8 已提交
16
#include <algorithm>
17
#include <utility>
W
wangliu 已提交
18
#include <vector>
L
liuruilong 已提交
19
#include "common/enforce.h"
L
liuruilong 已提交
20
#include "common/log.h"
L
liuruilong 已提交
21
#include "framework/framework.pb-c.h"
L
liuruilong 已提交
22 23
#include "framework/lod_tensor.h"
#include "framework/operator.h"
L
liuruilong 已提交
24
#include "framework/program/program-optimize/program_optimize.h"
L
liuruilong 已提交
25 26 27 28
#include "framework/program/program_desc.h"
#include "framework/program/var_desc.h"
#include "framework/scope.h"
#include "framework/tensor.h"
29
#include "operators/math/gemm.h"
W
wangliu 已提交
30 31

namespace paddle_mobile {
32

W
wangliu 已提交
33 34 35
using framework::Variable;

template <typename Dtype, Precision P>
H
hjchen2 已提交
36
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
37
                             const bool use_optimize, const bool loddable)
H
hjchen2 已提交
38 39 40 41
    : program_(p),
      batch_size_(batch_size),
      use_optimize_(use_optimize),
      loddable_(loddable) {
W
wangliu 已提交
42
  Variable *variable_ptr = program_.scope->Var("batch_size");
H
hjchen2 已提交
43
  variable_ptr->SetValue<int>(batch_size);
Refine  
陈后江 已提交
44 45
  to_predict_program_ =
      use_optimize_ ? program_.optimizeProgram : program_.originProgram;
46 47
  PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
                        "to_predict_program_ == NULL!");
48
  const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
W
wangliu 已提交
49
      to_predict_program_->Blocks();
50 51

  DLOG << "executor in loaddable mode: " << loddable_;
W
wangliu 已提交
52 53 54 55 56
  for (int i = 0; i < blocks.size(); ++i) {
    std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
57
      DLOG << "create op: " << op->Type();
W
wangliu 已提交
58 59 60
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
Refine  
陈后江 已提交
61 62
      // infer shape to reshape tensor before predict,
      // but for lod tensor, it will need to reshape in runtime
xiebaiyuan's avatar
xiebaiyuan 已提交
63 64 65
      if (!loddable_) {
        op_base->InferShape();
      }
W
wangliu 已提交
66 67 68
      ops_of_block_[*block_desc.get()].push_back(op_base);
    }
  }
W
wangliu 已提交
69
  if (program_.combined) {
L
liuruilong 已提交
70 71 72 73
    InitCombineMemory();
  } else {
    InitMemory();
  }
L
liuruilong 已提交
74
  std::shared_ptr<framework::BlockDesc> to_predict_block =
L
liuruilong 已提交
75
      to_predict_program_->Block(0);
L
liuruilong 已提交
76
  auto &ops = ops_of_block_[*to_predict_block.get()];
L
liuruilong 已提交
77
  for (const auto &op : ops) {
L
liuruilong 已提交
78 79
    op->Init();
  }
W
wangliu 已提交
80 81
}

82
template <typename Dtype>
Refine  
陈后江 已提交
83 84
void LoadMemInternal(void **data, framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
85
  int64_t size = tensor->numel();
86
  Dtype *tensor_data = tensor->mutable_data<Dtype>();
87
  if (0) {
88
    // TODO(hjchen2) should be moved into operator init function
89 90 91 92 93 94
    float min_value;
    float max_value;
    memcpy(&min_value, data_buf, sizeof(float));
    memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
    data_buf += 2 * sizeof(float);
    const float factor = (max_value - min_value) / 255.0;
95
    const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
96 97
    for (int k = 0; k < size; ++k) {
      tensor_data[k] = uint8_data[k] * factor + min_value;
W
wangliu 已提交
98
    }
99 100
    data_buf += size * sizeof(uint8_t);
  } else {
Refine  
陈后江 已提交
101 102
    memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
    *data_buf += size * sizeof(Dtype);
L
liuruilong 已提交
103
  }
104
}
W
wangliu 已提交
105

106
// Parse one serialized variable from the byte cursor `*data` into
// `tensor`, advancing the cursor. Wire layout consumed in order:
// u32 version | u64 lod_level | per-level (u64 byte-size + size_t[]) |
// u32 tensor version | i32 desc size | desc bytes (skipped) | raw data.
// The tensor's shape is taken from `var_desc`, not the skipped desc blob.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::LoadMemory(
    void **data, const std::shared_ptr<framework::VarDesc> var_desc,
    framework::LoDTensor *tensor) {
  char **data_buf = reinterpret_cast<char **>(data);
  // version (read but not validated here)
  uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // lod information
  uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
  *data_buf += sizeof(uint64_t);

  auto *lod = tensor->mutable_lod();
  lod->resize(lod_level);
  for (uint64_t i = 0; i < lod_level; ++i) {
    // `size` is the byte length of this LoD level's offset array.
    uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
    *data_buf += sizeof(uint64_t);
    std::vector<size_t> tmp_dim(size / sizeof(size_t));
    memcpy(tmp_dim.data(), *data_buf, size);
    (*lod)[i] = std::move(tmp_dim);
    *data_buf += size;
  }
  // tensor version (read but not validated here)
  uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
  *data_buf += sizeof(uint32_t);
  // tensor desc size
  int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
  *data_buf += sizeof(int32_t);
  // skip tensor desc; dims/dtype come from var_desc instead
  *data_buf += tensor_desc_size;

  const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
  tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
  // parse tensor from stream
  switch (tensor_desc.DataType()) {
    case framework::VARTYPE_TYPE_FP32:
      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case framework::VARTYPE_TYPE_INT8:
      LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    case framework::VARTYPE_TYPE_INT32:
      LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
      break;
    default:
      // NOTE(review): unsupported types only log; the cursor is NOT
      // advanced past the raw data, so subsequent parses would misalign.
      LOG(kLOG_ERROR) << "data type is not supported";
  }
}

// Load every persistable variable from its own parameter file under
// program_.model_path, and allocate memory for non-persistable LoD
// tensors. "feed"/"fetch" are runtime-populated and skipped.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitMemory() {
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
      auto tensor = var->template GetMutable<framework::LoDTensor>();
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
        char *origin_data =
            ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
        // Fail fast on a missing/unreadable parameter file, mirroring the
        // null check InitCombineMemory() already performs.
        PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
        char *data = origin_data;
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
        delete[] origin_data;
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
          varInputMemory(var_desc, var, tensor);
        }
      }
    }
  }
}

L
liuruilong 已提交
179
template <typename Dtype, Precision P>
L
liuruilong 已提交
180
void Executor<Dtype, P>::InitCombineMemory() {
Refine  
陈后江 已提交
181
  char *origin_data = nullptr;
Refine  
陈后江 已提交
182
  bool self_alloc = false;
183
  if (program_.combined_params_buf && program_.combined_params_len) {
184 185
    origin_data = reinterpret_cast<char *>(
        const_cast<uint8_t *>(program_.combined_params_buf));
186
  } else {
Refine  
陈后江 已提交
187
    self_alloc = true;
Refine  
陈后江 已提交
188
    origin_data = ReadFileToBuff(program_.para_path);
189
  }
Refine  
陈后江 已提交
190 191
  PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
  char *data = origin_data;
L
liuruilong 已提交
192 193 194
  for (const auto &block : to_predict_program_->Blocks()) {
    for (const auto &var_desc : block->Vars()) {
      auto var = program_.scope->Var(var_desc->Name());
195
      auto tensor = var->template GetMutable<framework::LoDTensor>();
L
liuruilong 已提交
196 197 198 199
      if (var_desc->Persistable()) {
        if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
          continue;
        }
200
        LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
L
liuruilong 已提交
201 202
      } else {
        if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
203
          varInputMemory(var_desc, var, tensor);
L
liuruilong 已提交
204 205 206 207
        }
      }
    }
  }
Refine  
陈后江 已提交
208
  if (self_alloc) {
209
    delete[] origin_data;
Refine  
陈后江 已提交
210 211
  }
  LOG(kLOG_INFO) << "init combine memory finish";
L
liuruilong 已提交
212
}
213

xiebaiyuan's avatar
xiebaiyuan 已提交
214 215 216 217
// Allocate memory for an input tensor according to its declared data
// type. Returns true for a handled type; enforces (aborts) otherwise.
//
// Fix: the list of supported types was previously duplicated between the
// switch and a separate boolean expression; the switch default now sets
// the flag, so adding a type requires touching only one place.
template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory(
    const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
    framework::LoDTensor *tensor) const {
  auto type = var_desc->Tensor_desc().DataType();
  bool is_mute_match = true;  // false only for unhandled types
  switch (type) {
    case framework::VARTYPE_TYPE_FP32:
      tensor->mutable_data<float>();
      break;
    case framework::VARTYPE_TYPE_INT8:
      tensor->mutable_data<int8_t>();
      break;
    case framework::VARTYPE_TYPE_INT32:
      tensor->mutable_data<int32_t>();
      break;
    case framework::VARTYPE_TYPE_INT64:
      tensor->mutable_data<int64_t>();
      break;
    default:
      is_mute_match = false;
      break;
  }
  PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
  return is_mute_match;
}
L
liuruilong 已提交
242

W
wangliu 已提交
243
// Run one forward pass of block 0 over input `t` and return a copy of
// the first output of the last operator. When PADDLE_MOBILE_PROFILE is
// defined, per-op wall time is accumulated by op type and printed.
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t) {
  // Bind the input to the "feed" variable; ShareDataWith avoids copying
  // the underlying buffer.
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    // Monotonic clock timestamps in nanoseconds around each op.
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    // to Run
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  // The prediction result is the first output of the last operator.
  auto last_op = ops.rbegin();
  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  // Aggregate per-op times by op type and print a table sorted by cost,
  // with a trailing "total" row.
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  // Return by value (deep copy) so the caller is independent of scope
  // variables that the next Predict() call will overwrite.
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}
xiebaiyuan's avatar
xiebaiyuan 已提交
306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377

// LoD-aware variant of Predict(): also propagates the input's LoD to the
// feed tensor and, in loddable mode, re-runs InferShape per op because
// LoD tensor shapes can change at runtime.
template <typename Dtype, Precision P>
std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
    const framework::LoDTensor &t) {
  framework::Variable *g_feed_value = program_.scope->Var("feed");
  framework::LoDTensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
  feed_tensor->set_lod(t.lod());

  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);

  auto &ops = ops_of_block_[*to_predict_block.get()];

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    // Shapes were not inferred at construction time in loddable mode,
    // so they must be inferred before each run.
    if (loddable_) {
      ops[i]->InferShape();
    }
    ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
  // Result is the first output of the last operator.
  auto last_op = ops.rbegin();

  auto output_map = (*last_op)->Outputs();
  std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(out_keys.size() > 0, "the last op contains no output");
  framework::LoDTensor *output_tensor =
      framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
                                                   *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE
  // Aggregate per-op times by op type; print sorted table plus total.
  std::unordered_map<std::string, uint64_t> _tp;
  for (int i = 0; i < profile.size(); i++) {
    const auto &pInfo = profile[i];
    uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
    _tp[ops[i]->Type()] += timeCost;
  }
  printf("====================[ profile ]======================\n");
  using prof_t = std::pair<std::string, uint64_t>;
  std::vector<prof_t> _tv(_tp.begin(), _tp.end());
  uint64_t _ptotal = 0;
  for (auto const &p : _tv) {
    _ptotal += p.second;
  }
  auto compf = [](const prof_t &a, const prof_t &b) {
    return a.second > b.second;
  };
  std::sort(_tv.begin(), _tv.end(), compf);
  _tv.push_back(std::make_pair("total", _ptotal));
  for (auto const &p : _tv) {
    printf("%-16s\t%-10.0f\t%-2.4f\n", p.first.c_str(),
           static_cast<float>(p.second),
           static_cast<float>(p.second) / _ptotal * 100.0);
  }
  printf("====================[---------]======================\n");
#endif
  // Deep copy so the result survives subsequent predictions.
  return std::make_shared<framework::LoDTensor>(
      framework::LoDTensor(*output_tensor));
}

W
wangliu 已提交
378 379 380 381
// NOTE: block_id is currently ignored — prediction always runs block 0
// via the single-argument overload.
template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
    const framework::Tensor &t, int block_id) {
  return Predict(t);
}

template <typename Dtype, Precision P>
L
liuruilong 已提交
385
std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
W
wangliu 已提交
386 387
    const std::vector<Ptype> &input, const std::vector<int64_t> &dims) {
  framework::Tensor tensor(input, framework::make_ddim(dims));
W
wangliu 已提交
388 389 390 391 392 393 394 395
  std::shared_ptr<framework::Tensor> output_tensor = Predict(tensor, 0);
  Executor<Dtype, P>::Ptype *output_ptr =
      output_tensor->data<typename Executor<Dtype, P>::Ptype>();
  std::vector<typename Executor<Dtype, P>::Ptype> result_vector;
  for (int j = 0; j < output_tensor->numel(); ++j) {
    result_vector.push_back(output_ptr[j]);
  }
  return result_vector;
W
wangliu 已提交
396 397
}

398 399
#ifdef PADDLE_MOBILE_FPGA
// Bind tensor `t` to the scope variable `var_name` without copying:
// the variable's tensor shares t's underlying buffer.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
                                        string var_name) {
  framework::Variable *g_feed_value = program_.scope->Var(var_name);
  framework::Tensor *feed_tensor =
      g_feed_value->GetMutable<framework::LoDTensor>();
  feed_tensor->Resize(t.dims());
  feed_tensor->ShareDataWith(t);
}
408

409 410 411
// Feed input data by binding it to the standard "feed" variable.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
  InjectVariable(t, "feed");
}
413

414
template <typename Dtype, Precision P>
415
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
416 417 418
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
419

Z
zhangyang 已提交
420 421 422 423 424
  PADDLE_MOBILE_ENFORCE(id < (int)ops.size(), "Index out of range");
  auto op = id < 0 ? ops[ops.size() - 1] : ops[id];
  auto output_map = op->Outputs();
  std::vector<std::string> out_keys = op->GetOutKeys();
  PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output");
425 426 427
  auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
      out_keys[0], output_map, *(program_.scope));
  return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
428
}
429 430 431 432 433 434

// Run the operators of block 0 in the half-open index range [start, end).
// A negative end means "through the last operator". Assumes input data
// has already been injected (see FeedData/InjectVariable).
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) {
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
  end = end < 0 ? static_cast<int>(ops.size()) : end;
  PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
                        "start or end parameter is wrong");

#ifdef PADDLE_MOBILE_PROFILE
  std::vector<ProfInfo> profile(ops.size());
#endif
  for (int i = start; i < end; i++) {
#ifdef PADDLE_MOBILE_PROFILE
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

#ifdef PADDLE_MOBILE_PROFILE
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
  }
}
457 458 459 460

// Run from operator `start` through the end of block 0.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) {
  Predict_From_To(start);
}
462 463 464 465

// Run operators [0, end) of block 0.
template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) {
  Predict_From_To(0, end);
}
467 468
#endif

W
wangliu 已提交
469
// Explicit instantiations for every supported target device.
template class Executor<CPU, Precision::FP32>;
template class Executor<GPU_MALI, Precision::FP32>;
template class Executor<FPGA, Precision::FP32>;
W
wangliu 已提交
472 473

}  // namespace paddle_mobile