Commit 84bdddb2 by jiweibo

reorganize stream. test=develop

Parent: 07ae2599
@@ -349,6 +349,11 @@ void Predictor::GenRuntimeProgram() {
   program_ = optimizer_.GenRuntimeProgram();
   CHECK_EQ(exec_scope_, program_->exec_scope());
   program_generated_ = true;
+#ifdef LITE_WITH_CUDA
+  if (!multi_stream_) {
+    program_->UpdateContext(exec_stream_, io_stream_);
+  }
+#endif
 }
 
 const lite::Tensor *Predictor::GetTensor(const std::string &name) const {
......
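Note: Predictor::GenRuntimeProgram() only pushes the single exec/io stream pair into the kernels when multi-stream mode is off; in multi-stream mode the per-kernel streams are presumably assigned by the multi_stream_analysis_pass registered in CudaEnvInit() below, and must not be overwritten here.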
@@ -20,12 +20,17 @@
 #include <utility>
 #include <vector>
 #include "lite/api/paddle_api.h"
+#include "lite/core/device_info.h"
 #include "lite/core/op_lite.h"
 #include "lite/core/optimizer.h"
 #include "lite/core/program.h"
 #include "lite/core/types.h"
 #include "lite/model_parser/model_parser.h"
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
 
 namespace paddle {
 namespace lite {
@@ -56,7 +61,9 @@ class LITE_API Predictor {
       const std::vector<std::string>& var_names = {})
       : program_desc_(desc), scope_(root) {
     Program program(*desc.get(), scope_, valid_places, var_names);
-    optimizer_ = Optimizer(std::move(program), valid_places);
+    std::vector<std::string> passes{};
+    // TODO(wilber): rethink a new way to associate config and passes.
+    optimizer_ = Optimizer(std::move(program), valid_places, passes);
     exec_scope_ = optimizer_.exec_scope();
     valid_places_ = valid_places;
   }
@@ -146,14 +153,23 @@ class LITE_API Predictor {
                  bool record_info = false);
   void SaveOpKernelInfo(const std::string& model_dir);
 
   // #ifdef LITE_WITH_TRAIN
   //  void Run(const std::vector<framework::Tensor>& tensors) {
   //    FeedVars(tensors);
   //    program_->Run();
   //  }
   //  void FeedVars(const std::vector<framework::Tensor>& tensors);
   // #endif
 
+#ifdef LITE_WITH_CUDA
+  void SetMultiStream(bool multi_stream) { multi_stream_ = multi_stream; }
+  bool multi_stream() { return multi_stream_; }
+  void SetExecStream(cudaStream_t* stream) { exec_stream_ = stream; }
+  void SetIoStream(cudaStream_t* stream) { io_stream_ = stream; }
+  const cudaStream_t& exec_stream() { return *exec_stream_; }
+  const cudaStream_t& io_stream() { return *io_stream_; }
+#endif
+
  private:
   Optimizer optimizer_;
@@ -165,6 +181,11 @@ class LITE_API Predictor {
   std::vector<std::string> input_names_;
   std::vector<std::string> output_names_;
   std::vector<Place> valid_places_;
+#ifdef LITE_WITH_CUDA
+  bool multi_stream_{false};
+  cudaStream_t* io_stream_;
+  cudaStream_t* exec_stream_;
+#endif
 };
 
 class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
@@ -178,6 +199,8 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
     status_is_cloned_ = true;
   }
 
+  ~CxxPaddleApiImpl();
+
   /// Create a new predictor from a config.
   void Init(const lite_api::CxxConfig& config);
@@ -216,11 +239,31 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
       lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
       bool record_info = false) override;
 
+ private:
+#ifdef LITE_WITH_CUDA
+  // CUDA-related environment initialization, including setting stream
+  // pointers, initializing synchronization events, setting predictor_id, etc.
+  void CudaEnvInit(std::vector<std::string>* passes);
+  // Due to the asynchronous nature of CUDA kernel execution, synchronization
+  // is required after setting the inputs and before reading the outputs.
+  void InputSync();
+  void OutputSync();
+#endif
+
  private:
   std::shared_ptr<Predictor> raw_predictor_;
   lite_api::CxxConfig config_;
   std::mutex mutex_;
   bool status_is_cloned_;
+#ifdef LITE_WITH_CUDA
+  bool multi_stream_{false};
+  cudaStream_t* io_stream_;
+  cudaStream_t* exec_stream_;
+  cudaEvent_t input_event_;
+  std::vector<cudaEvent_t> output_events_;
+  // Only used in multi exec stream mode.
+  std::vector<cudaStream_t*> exec_streams_;
+#endif
 };
 
/*
......
@@ -23,7 +23,9 @@
 #ifndef LITE_ON_TINY_PUBLISH
 #include "lite/api/paddle_use_passes.h"
 #endif
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
 #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
     !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
 #include <omp.h>
@@ -34,7 +36,6 @@ namespace lite {
 void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   config_ = config;
-  if (!status_is_cloned_) {
   auto places = config.valid_places();
   std::vector<std::string> passes = config.get_passes_internal();
 #ifdef LITE_WITH_CUDA
@@ -42,15 +43,13 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   // otherwise skip this step.
   for (auto &p : places) {
     if (p.target == TARGET(kCUDA)) {
-      Env<TARGET(kCUDA)>::Init();
-      if (config_.multi_stream()) {
-        passes = {"multi_stream_analysis_pass"};
-        VLOG(3) << "add pass: " << passes[0];
-      }
+      CudaEnvInit(&passes);
       break;
     }
   }
 #endif
+  if (!status_is_cloned_) {
 #ifdef LITE_WITH_MLU
     Env<TARGET(kMLU)>::Init();
     lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(),
@@ -73,6 +72,7 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
     raw_predictor_->PrepareFeedFetch();
     CHECK(raw_predictor_) << "The Predictor can not be nullptr in Clone mode.";
   }
+
   mode_ = config.power_mode();
   threads_ = config.threads();
 #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
@@ -87,15 +87,83 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
 #endif
 }
 
+#ifdef LITE_WITH_CUDA
+void CxxPaddleApiImpl::CudaEnvInit(std::vector<std::string> *passes) {
+  Env<TARGET(kCUDA)>::Init();
+  // init two streams for each predictor.
+  if (config_.exec_stream()) {
+    exec_stream_ = config_.exec_stream();
+  } else {
+    exec_stream_ = new cudaStream_t();
+    TargetWrapperCuda::CreateStream(exec_stream_);
+  }
+  if (config_.io_stream()) {
+    io_stream_ = config_.io_stream();
+  } else {
+    io_stream_ = new cudaStream_t();
+    TargetWrapperCuda::CreateStream(io_stream_);
+  }
+  raw_predictor_->SetExecStream(exec_stream_);
+  raw_predictor_->SetIoStream(io_stream_);
+
+  // init sync events.
+  if (config_.multi_stream()) {
+    multi_stream_ = true;
+    raw_predictor_->SetMultiStream(multi_stream_);
+    passes->push_back("multi_stream_analysis_pass");
+    VLOG(3) << "add pass: " << (*passes)[0];
+    Env<TargetType::kCUDA>::Devs &devs = Env<TargetType::kCUDA>::Global();
+    int dev_id = TargetWrapperCuda::GetCurDevice();
+    for (size_t i = 0; i < lite::kMaxStream; ++i) {
+      exec_streams_.push_back(
+          const_cast<cudaStream_t *>(&devs[dev_id].exec_streams()[i]));
+      cudaEvent_t out_event;
+      TargetWrapperCuda::CreateEventWithFlags(&out_event);
+      output_events_.push_back(out_event);
+    }
+  } else {
+    cudaEvent_t out_event;
+    TargetWrapperCuda::CreateEventWithFlags(&out_event);
+    output_events_.push_back(out_event);
+  }
+  TargetWrapperCuda::CreateEventWithFlags(&input_event_);
+}
+
+void CxxPaddleApiImpl::InputSync() {
+  TargetWrapperCuda::RecordEvent(input_event_, *io_stream_);
+  if (multi_stream_) {
+    for (int i = 0; i < lite::kMaxStream; ++i) {
+      TargetWrapperCuda::StreamSync(*exec_streams_[i], input_event_);
+    }
+  } else {
+    TargetWrapperCuda::StreamSync(*exec_stream_, input_event_);
+  }
+}
+
+void CxxPaddleApiImpl::OutputSync() {
+  if (multi_stream_) {
+    for (size_t i = 0; i < output_events_.size(); ++i) {
+      TargetWrapperCuda::RecordEvent(output_events_[i], *exec_streams_[i]);
+      TargetWrapperCuda::StreamSync(*io_stream_, output_events_[i]);
+    }
+  } else {
+    TargetWrapperCuda::RecordEvent(output_events_[0], *exec_stream_);
+    TargetWrapperCuda::StreamSync(*io_stream_, output_events_[0]);
+  }
+}
+#endif
+
 std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
   auto *x = raw_predictor_->GetInput(i);
-  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
+  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x, io_stream_));
 }
 
 std::unique_ptr<const lite_api::Tensor> CxxPaddleApiImpl::GetOutput(
     int i) const {
   const auto *x = raw_predictor_->GetOutput(i);
-  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x));
+  return std::unique_ptr<lite_api::Tensor>(new lite_api::Tensor(x, io_stream_));
 }
 
 std::vector<std::string> CxxPaddleApiImpl::GetInputNames() {
@@ -114,7 +182,15 @@ void CxxPaddleApiImpl::Run() {
 #ifdef LITE_WITH_ARM
   lite::DeviceInfo::Global().SetRunMode(mode_, threads_);
 #endif
+#ifdef LITE_WITH_CUDA
+  InputSync();
+#endif
+
   raw_predictor_->Run();
+
+#ifdef LITE_WITH_CUDA
+  OutputSync();
+#endif
 }
 
 std::shared_ptr<lite_api::PaddlePredictor> CxxPaddleApiImpl::Clone() {
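For context, the block below is a minimal standalone sketch (not part of this commit) of the event-based ordering that InputSync() and OutputSync() establish around Run() in the single exec stream case. It uses the raw CUDA runtime API rather than TargetWrapperCuda, and the function name is illustrative only.

#include <cuda_runtime.h>

// Sketch: order io-stream copies around exec-stream kernels with one event
// on each side, mirroring input_event_ / output_events_[0] above.
void OrderedRun(cudaStream_t io_stream, cudaStream_t exec_stream) {
  cudaEvent_t input_event, output_event;
  cudaEventCreateWithFlags(&input_event, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&output_event, cudaEventDisableTiming);

  // ... H2D input copies are issued on io_stream (Tensor::CopyFromCpu) ...

  // InputSync(): exec_stream may not start until the input copies finish.
  cudaEventRecord(input_event, io_stream);
  cudaStreamWaitEvent(exec_stream, input_event, 0);

  // ... kernels are launched on exec_stream (raw_predictor_->Run()) ...

  // OutputSync(): io_stream may not read the outputs until the kernels
  // finish, so the later D2H copies (Tensor::CopyToCpu) see final results.
  cudaEventRecord(output_event, exec_stream);
  cudaStreamWaitEvent(io_stream, output_event, 0);

  cudaEventDestroy(input_event);
  cudaEventDestroy(output_event);
}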
@@ -160,6 +236,17 @@ void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
   raw_predictor_->SaveModel(model_dir, model_type, record_info);
 }
 
+CxxPaddleApiImpl::~CxxPaddleApiImpl() {
+#ifdef LITE_WITH_CUDA
+  TargetWrapperCuda::DestroyEvent(input_event_);
+  for (size_t i = 0; i < output_events_.size(); ++i) {
+    TargetWrapperCuda::DestroyEvent(output_events_[i]);
+  }
+  if (multi_stream_) {
+    TargetWrapperCuda::DestroyStream(*io_stream_);
+    TargetWrapperCuda::DestroyStream(*exec_stream_);
+  }
+#endif
+}
+
 }  // namespace lite
 
 namespace lite_api {
......
@@ -84,7 +84,7 @@ TEST(CXXApi, clone_predictor) {
   auto* cloned_output_tensor = cloned_predictor->GetOutput(0);
 
   int step = 50;
-  for (int i = 0; i < output_tensor->data_size(); i += step) {
+  for (size_t i = 0; i < output_tensor->data_size(); i += step) {
     EXPECT_NEAR(output_tensor->data<float>()[i],
                 cloned_output_tensor->data<float>()[i],
                 1e-6);
......
@@ -30,6 +30,15 @@ Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
 // TODO(Superjomn) refine this by using another `const void* const_raw`;
 Tensor::Tensor(const void *raw) { raw_tensor_ = const_cast<void *>(raw); }
 
+#ifdef LITE_WITH_CUDA
+Tensor::Tensor(void *raw, cudaStream_t *stream)
+    : raw_tensor_(raw), io_stream_(stream) {}
+Tensor::Tensor(const void *raw, cudaStream_t *stream) : io_stream_(stream) {
+  raw_tensor_ = const_cast<void *>(raw);
+}
+#endif
+
 lite::Tensor *tensor(void *x) { return static_cast<lite::Tensor *>(x); }
 const lite::Tensor *ctensor(void *x) {
   return static_cast<const lite::Tensor *>(x);
@@ -93,8 +102,8 @@ void Tensor::CopyFromCpu(const T *src_data) {
         data, src_data, num * sizeof(T), lite::IoDirection::HtoH);
   } else if (type == TargetType::kCUDA) {
 #ifdef LITE_WITH_CUDA
-    lite::TargetWrapperCuda::MemcpySync(
-        data, src_data, num * sizeof(T), lite::IoDirection::HtoD);
+    lite::TargetWrapperCuda::MemcpyAsync(
+        data, src_data, num * sizeof(T), lite::IoDirection::HtoD, *io_stream_);
 #else
     LOG(FATAL) << "Please compile the lib with CUDA.";
 #endif
@@ -113,8 +122,9 @@ void Tensor::CopyToCpu(T *data) const {
         data, src_data, num * sizeof(T), lite::IoDirection::HtoH);
   } else if (type == TargetType::kCUDA) {
 #ifdef LITE_WITH_CUDA
-    lite::TargetWrapperCuda::MemcpySync(
-        data, src_data, num * sizeof(T), lite::IoDirection::DtoH);
+    lite::TargetWrapperCuda::MemcpyAsync(
+        data, src_data, num * sizeof(T), lite::IoDirection::DtoH, *io_stream_);
+    lite::TargetWrapperCuda::StreamSync(*io_stream_);
 #else
     LOG(FATAL) << "Please compile the lib with CUDA.";
 #endif
......
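Note on the two copy paths above: CopyToCpu() now issues an async D2H copy and then synchronizes the io stream, so the host buffer is guaranteed to be filled when the call returns. CopyFromCpu() stays fully asynchronous, which is safe because InputSync() later makes the exec stream(s) wait on the io stream before any kernel consumes the input.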
@@ -24,6 +24,10 @@
 #include <vector>
 #include "paddle_place.h"  // NOLINT
 
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
+
 namespace paddle {
 namespace lite_api {
@@ -61,8 +65,16 @@ struct LITE_API Tensor {
   // Set LoD of the tensor
   void SetLoD(const lod_t& lod);
 
+#ifdef LITE_WITH_CUDA
+  explicit Tensor(void* raw, cudaStream_t* stream);
+  explicit Tensor(const void* raw, cudaStream_t* stream);
+#endif
+
  private:
   void* raw_tensor_;
+#ifdef LITE_WITH_CUDA
+  cudaStream_t* io_stream_{nullptr};
+#endif
 };
 
 /// The PaddlePredictor defines the basic interfaces for different kinds of
@@ -155,6 +167,8 @@ class LITE_API CxxConfig : public ConfigBase {
 #endif
 #ifdef LITE_WITH_CUDA
   bool multi_stream_{false};
+  cudaStream_t* exec_stream_{nullptr};
+  cudaStream_t* io_stream_{nullptr};
 #endif
 #ifdef LITE_WITH_MLU
   lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
@@ -203,6 +217,12 @@ class LITE_API CxxConfig : public ConfigBase {
 #ifdef LITE_WITH_CUDA
   void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
   bool multi_stream() const { return multi_stream_; }
+  void set_exec_stream(cudaStream_t* exec_stream) {
+    exec_stream_ = exec_stream;
+  }
+  void set_io_stream(cudaStream_t* io_stream) { io_stream_ = io_stream; }
+  cudaStream_t* exec_stream() { return exec_stream_; }
+  cudaStream_t* io_stream() { return io_stream_; }
 #endif
 #ifdef LITE_WITH_MLU
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>

#include <functional>  // for std::multiplies used below
#include <numeric>     // for std::accumulate used below
#include <vector>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
void RunModel(lite_api::CxxConfig config) {
auto predictor = lite_api::CreatePaddlePredictor(config);
const int batch_size = 4;
const int channels = 3;
const int height = 224;
const int width = 224;
auto input_tensor = predictor->GetInput(0);
std::vector<int64_t> input_shape{batch_size, channels, height, width};
input_tensor->Resize(input_shape);
std::vector<float> in_data(batch_size * channels * height * width);
for (size_t i = 0; i < in_data.size(); i++) {
in_data[i] = 1;
}
input_tensor->CopyFromCpu<float, lite_api::TargetType::kCUDA>(in_data.data());
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor->Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor->Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
std::vector<float> results{
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598,
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598,
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598,
0.000241399, 0.000224183, 0.000536607, 0.000286386, 0.000726817,
0.000212999, 0.00638716, 0.00128127, 0.000135354, 0.000767598};
auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], batch_size);
ASSERT_EQ(out->shape()[1], 1000);
std::vector<int64_t> shape = out->shape();
int out_num =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
std::vector<float> out_cpu(out_num);
out->CopyToCpu(out_cpu.data());
int step = 100;
for (size_t i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out_cpu[i * step], results[i], 1e-6);
}
}
TEST(Resnet50, config_no_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
RunModel(config);
}
TEST(Resnet50, config_exec_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
cudaStream_t exec_stream;
lite::TargetWrapperCuda::CreateStream(&exec_stream);
config.set_exec_stream(&exec_stream);
RunModel(config);
}
TEST(Resnet50, config_io_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
cudaStream_t io_stream;
lite::TargetWrapperCuda::CreateStream(&io_stream);
config.set_io_stream(&io_stream);
RunModel(config);
}
TEST(Resnet50, config_all_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
cudaStream_t exec_stream;
lite::TargetWrapperCuda::CreateStream(&exec_stream);
config.set_exec_stream(&exec_stream);
cudaStream_t io_stream;
lite::TargetWrapperCuda::CreateStream(&io_stream);
config.set_io_stream(&io_stream);
RunModel(config);
}
TEST(Resnet50, config_multi_exec_stream) {
lite_api::CxxConfig config;
config.set_model_dir(FLAGS_model_dir);
config.set_valid_places({lite_api::Place{TARGET(kCUDA), PRECISION(kFloat)}});
config.set_multi_stream(true);
RunModel(config);
}
} // namespace lite
} // namespace paddle
@@ -15,6 +15,7 @@
 #pragma once
 #include <cuda.h>
 #include <cuda_runtime.h>
+#include "lite/backends/cuda/cuda_utils.h"
 #include "lite/core/target_wrapper.h"
 
 namespace paddle {
@@ -31,34 +32,40 @@ class TargetWrapper<TARGET(kCUDA)> {
   static size_t num_devices();
   static size_t maximum_stream() { return 0; }
 
-  static size_t GetCurDevice() {
+  static int GetCurDevice() {
     int dev_id;
-    cudaGetDevice(&dev_id);
+    CUDA_CALL(cudaGetDevice(&dev_id));
     return dev_id;
   }
-  static void CreateStream(stream_t* stream) {}
-  static void DestroyStream(const stream_t& stream) {}
+  static void CreateStream(stream_t* stream) {
+    CUDA_CALL(cudaStreamCreate(stream));
+  }
+  static void DestroyStream(const stream_t& stream) {
+    CUDA_CALL(cudaStreamDestroy(stream));
+  }
 
-  static void CreateEvent(event_t* event) { cudaEventCreate(event); }
+  static void CreateEvent(event_t* event) { CUDA_CALL(cudaEventCreate(event)); }
   static void CreateEventWithFlags(
       event_t* event, unsigned int flags = cudaEventDisableTiming) {
-    cudaEventCreateWithFlags(event, flags);
+    CUDA_CALL(cudaEventCreateWithFlags(event, flags));
   }
-  static void DestroyEvent(const event_t& event) { cudaEventDestroy(event); }
+  static void DestroyEvent(const event_t& event) {
+    CUDA_CALL(cudaEventDestroy(event));
+  }
 
   static void RecordEvent(const event_t& event) {}
   static void RecordEvent(const event_t& event, const stream_t& stream) {
-    cudaEventRecord(event, stream);
+    CUDA_CALL(cudaEventRecord(event, stream));
   }
   static void SyncEvent(const event_t& event) {}
 
   static void StreamSync(const stream_t& stream) {
-    cudaStreamSynchronize(stream);
+    CUDA_CALL(cudaStreamSynchronize(stream));
   }
   static void StreamSync(const stream_t& stream, const event_t& event) {
-    cudaStreamWaitEvent(stream, event, 0);
+    CUDA_CALL(cudaStreamWaitEvent(stream, event, 0));
  }
-  static void DeviceSync() { cudaDeviceSynchronize(); }
+  static void DeviceSync() { CUDA_CALL(cudaDeviceSynchronize()); }
 
   static void* Malloc(size_t size);
   static void Free(void* ptr);
......
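The CUDA_CALL macro used above comes from lite/backends/cuda/cuda_utils.h, which is not shown in this diff. As a rough sketch only (the real macro may differ), such a wrapper typically checks the returned cudaError_t and aborts with the decoded message:

#include <cuda_runtime.h>
#include <glog/logging.h>  // for CHECK; Lite actually uses lite/utils/cp_logging.h

// Sketch of a CUDA_CALL-style error check; the actual definition lives in
// lite/backends/cuda/cuda_utils.h and may differ from this.
#define CUDA_CALL(func)                                  \
  {                                                      \
    cudaError_t e = (func);                              \
    CHECK(e == cudaSuccess) << "CUDA runtime error: "    \
                            << cudaGetErrorString(e);    \
  }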
@@ -26,6 +26,10 @@
 namespace paddle {
 namespace lite {
 
+// kMaxStream was chosen based on multi-stream performance testing; it may
+// change if the default multi-stream algorithm changes.
+constexpr int kMaxStream = 6;
+
 #if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
 typedef enum {
@@ -159,7 +163,7 @@ class Env {
     static Devs* devs = new Devs();
     return *devs;
   }
-  static void Init(int max_stream = 6) {
+  static void Init(int max_stream = lite::kMaxStream) {
 #ifdef LITE_WITH_MLU
     CNRT_CALL(cnrtInit(0));
 #endif
@@ -305,6 +309,7 @@ class Device<TARGET(kCUDA)> {
   bool has_hmma_;
   bool has_imma_;
   int runtime_version_;
+  // Currently only used in exec multi-stream mode.
   std::vector<cudaStream_t> exec_stream_;
   std::vector<cudaStream_t> io_stream_;
 };
......
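kMaxStream only fixes how many exec streams are pre-created per device; which of them a given kernel actually runs on is decided by multi_stream_analysis_pass, which is outside this diff. A purely hypothetical fallback policy would be a simple round-robin pick:

// Hypothetical round-robin choice among the pre-created exec streams; the
// real per-kernel assignment is computed by multi_stream_analysis_pass.
inline int PickExecStream(int kernel_index) {
  return kernel_index % paddle::lite::kMaxStream;
}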
@@ -40,7 +40,9 @@ class Optimizer {
  public:
   Optimizer() {}
 
-  Optimizer(Program&& program, const std::vector<Place>& valid_places) {
+  Optimizer(Program&& program,
+            const std::vector<Place>& valid_places,
+            const std::vector<std::string>& passes) {
     program_ = &program;
     valid_places_ = valid_places;
     CHECK(!valid_places.empty()) << "At least one valid_place should be set";
@@ -50,7 +52,7 @@ class Optimizer {
     factor.ConsiderPrecision();
     factor.ConsiderDataLayout();
 
-    Run(std::move(program), valid_places, factor, {});
+    Run(std::move(program), valid_places, factor, passes);
   }
 
   void Run(Program&& program,
......
@@ -73,7 +73,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
   std::map<std::string, cpp::VarDesc> origin_var_maps;
   auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
   auto var_size = main_block.VarsSize();
-  for (int i = 0; i < var_size; i++) {
+  for (int i = 0; i < static_cast<int>(var_size); i++) {
     auto v = main_block.GetVar<cpp::VarDesc>(i);
     auto name = v->Name();
     origin_var_maps.emplace(name, *v);
@@ -144,6 +144,15 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
     }
   }
 }
 
+#ifdef LITE_WITH_CUDA
+void RuntimeProgram::UpdateContext(cudaStream_t* exec, cudaStream_t* io) {
+  for (auto& inst : instructions_) {
+    inst.UpdateContext(exec, io);
+  }
+}
+#endif
+
 void RuntimeProgram::Run() {
 #ifdef LITE_WITH_PRECISION_PROFILE
   auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler();
@@ -210,7 +219,8 @@ void Program::Build(const cpp::ProgramDesc& prog) {
     if (op_type == "while" || op_type == "conditional_block" ||
         op_type == "subgraph") {
       auto sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
-      CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize())
+      CHECK(sub_block_idx >= 0 &&
+            sub_block_idx < static_cast<int>(program.BlocksSize()))
           << "Invalid attribute sub_block(" << sub_block_idx << ") for "
           << op_type;
       auto sub_block_desc =
......
@@ -128,6 +128,12 @@ struct Instruction {
     }
   }
   void Sync() const { kernel_->mutable_context()->As<CUDAContext>().Sync(); }
+  void UpdateContext(cudaStream_t* exec, cudaStream_t* io) {
+    if (kernel_->target() == TargetType::kCUDA) {
+      kernel_->mutable_context()->As<CUDAContext>().SetExecStream(*exec);
+      kernel_->mutable_context()->As<CUDAContext>().SetIoStream(*io);
+    }
+  }
 #endif
 
 #ifdef LITE_WITH_PROFILE
@@ -215,6 +221,12 @@ class LITE_API RuntimeProgram {
   // be added in vars_.
   void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
 
+#ifdef LITE_WITH_CUDA
+  // UpdateContext updates the exec stream and io stream of all kernels in
+  // the program.
+  void UpdateContext(cudaStream_t* exec, cudaStream_t* io);
+#endif
+
  private:
   RuntimeProgram(const RuntimeProgram&) = delete;
   std::vector<Instruction> instructions_;
......