diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index b2cdc1a369c36ed5c605005aee73663d35968ccf..76b51f5890ff39ac673ec9c014fcd882b371adc6 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -290,6 +290,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
   DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
 
+  DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);
+
   DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool);
   DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir,
                       NNAdapterModelCacheDir,
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 4551d8dbf5225243868dc91d2d1d0537c55b3028..71bfd1d7bfca25ca6af608ccf7d633a6ef567882 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -254,6 +254,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("use_xpu", new bool(argument->use_xpu()));
       pass->Set("xpu_l3_workspace_size",
                 new int(argument->xpu_l3_workspace_size()));
+      pass->Set("use_opencl", new bool(argument->use_opencl()));
       pass->Set("cpu_math_library_num_threads",
                 new int(argument->cpu_math_library_num_threads()));
       pass->Set("locked", new bool(argument->xpu_locked()));
diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
index 577e3df2e68f8ef9973e80a33438679631a92e63..45f0c589a7e9affe025cf77e1c6acdc2c062144f 100644
--- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc
@@ -252,6 +252,7 @@ void LiteSubgraphPass::SetUpEngine(
   bool use_xpu = Get<bool>("use_xpu");
   int xpu_device_id = Get<int>("xpu_device_id");
   int xpu_l3_workspace_size = Get<int>("xpu_l3_workspace_size");
+  bool use_opencl = Get<bool>("use_opencl");
   int cpu_math_library_num_threads = Get<int>("cpu_math_library_num_threads");
   bool locked = Get<bool>("locked");
   bool autotune = Get<bool>("autotune");
@@ -285,6 +286,8 @@ void LiteSubgraphPass::SetUpEngine(
 #ifdef LITE_WITH_NNADAPTER
     target_type = TARGET(kNNAdapter);
 #endif
+  } else if (use_opencl) {
+    target_type = TARGET(kOpenCL);
   } else {
 #ifdef PADDLE_WITH_ARM
     target_type = TARGET(kARM);
@@ -313,6 +316,33 @@ void LiteSubgraphPass::SetUpEngine(
 #endif
       paddle::lite_api::Place({TARGET(kHost), PRECISION(kFloat)}),
   };
+
+  // OpenCL has no int64, and has bugs with image I/O.
+  if (use_opencl) {
+    config.valid_places = {
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageDefault)},
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kImageFolder)},
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageDefault)},
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kImageFolder)},
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kNCHW)},
+        paddle::lite_api::Place{
+            TARGET(kOpenCL), PRECISION(kInt32), DATALAYOUT(kNCHW)},
+#ifdef PADDLE_WITH_ARM
+        paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
+#else
+        paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+#endif
+        paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
+    };
+  }
+
   config.cpu_math_library_num_threads = cpu_math_library_num_threads;
   config.xpu_l3_workspace_size = xpu_l3_workspace_size;
   config.device_id = xpu_device_id;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 00d667776eecc0c1de193515bf8cb45d54fca1a5..08d569635b0c9309ad03f7bf62e61693a34facb4 100755
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -446,6 +446,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(xpu_adaptive_seqlen_);
   CP_MEMBER(xpu_enable_multi_stream_);
 
+  // Lite OpenCL Related
+  CP_MEMBER(use_opencl_);
+
   // NPU related.
   CP_MEMBER(use_npu_);
   CP_MEMBER(npu_device_id_);
@@ -1157,6 +1160,11 @@ void AnalysisConfig::EnableLiteEngine(
   Update();
 }
 
+void AnalysisConfig::EnableOpenCL() {
+  use_opencl_ = true;
+  Update();
+}
+
 void AnalysisConfig::PartiallyRelease() {
   prog_file_.clear();
   prog_file_.shrink_to_fit();
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index dcfa0951d3d27c1360f7f9116bf37e60e3c5c142..686af28f76b8198f38c9139498e85301b776a85a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1150,6 +1150,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
   argument_.SetXpuDeviceId(config_.xpu_device_id_);
   argument_.SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+  argument_.SetUseOpenCL(config_.use_opencl_);
   // NNAdapter related
   argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter);
   argument_.SetNNAdapterDeviceNames(
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 579321fd17bdae612daa7c3d499e459b15265bbf..af34bfc796d21e507209cd411e9b2082abe03b45 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -415,6 +415,12 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool use_onnxruntime() const { return use_onnxruntime_; }
   ///
+  /// \brief A boolean state telling whether the Lite OpenCL is turned on.
+  ///
+  /// \return bool Whether the Lite OpenCL is turned on.
+  ///
+  bool use_opencl() const { return use_opencl_; }
+  ///
   /// \brief A boolean state telling whether the ONNXRuntime Optimization is
   /// turned on.
   ///
@@ -724,6 +730,11 @@ struct PD_INFER_DECL AnalysisConfig {
       const std::vector<std::string>& passes_filter = {},
       const std::vector<std::string>& ops_filter = {});
 
+  ///
+  /// \brief Turn on the usage of Lite sub-graph engine with OpenCL.
+  ///
+  void EnableOpenCL();
+
   ///
   /// \brief A boolean state indicating whether the Lite sub-graph engine is
   /// used.
@@ -1118,6 +1129,9 @@ struct PD_INFER_DECL AnalysisConfig {
   bool xpu_adaptive_seqlen_;
   bool xpu_enable_multi_stream_;
 
+  // LITE OPENCL SETTINGS
+  bool use_opencl_{false};
+
   // NNAdapter related
   LiteNNAdapterConfig nnadapter_config_;
 
diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc
index c48b718d8878e8aa9d8ce93f4e7f179cd2b8e5f0..cc7fbfa64f0ec4f0ff81e0b6992a7642aa53bf66 100644
--- a/paddle/fluid/inference/lite/engine.cc
+++ b/paddle/fluid/inference/lite/engine.cc
@@ -89,6 +89,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create(
           cfg.nnadapter_model_cache_buffer[i]);
     }
 #endif
+
+  if (cfg.use_opencl) {
+    lite_cxx_config.set_opencl_binary_path_name(cfg.opencl_bin_path,
+                                                cfg.opencl_bin_name);
+    lite_cxx_config.set_opencl_tune(cfg.opencl_tune_mode);
+    lite_cxx_config.set_opencl_precision(cfg.opencl_precision_type);
+  }
+
   // create predictor
   std::shared_ptr<paddle::lite_api::PaddlePredictor> p =
       paddle::lite_api::CreatePaddlePredictor(lite_cxx_config);
diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h
index bc38b5efaeb87bc0ac6109b95508f442a82956d8..39256b97f2b96d1f4f71804cfb003eb485671fba 100644
--- a/paddle/fluid/inference/lite/engine.h
+++ b/paddle/fluid/inference/lite/engine.h
@@ -66,6 +66,12 @@ struct EngineConfig {
   std::string nnadapter_subgraph_partition_config_path;
   std::vector<std::string> nnadapter_model_cache_token;
   std::vector<std::vector<char>> nnadapter_model_cache_buffer;
+
+  bool use_opencl{};
+  std::string opencl_bin_path = "./";
+  std::string opencl_bin_name = "lite_opencl_kernel.bin";
+  paddle::lite_api::CLTuneMode opencl_tune_mode{};
+  paddle::lite_api::CLPrecisionType opencl_precision_type{};
 };
 
 class EngineManager {
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index e076beb1c8744b6016ee41714c0dcec0e1080b9d..2bfe221659adee4fdc202ca4b3c9a8b984377518 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -691,6 +691,7 @@ void BindAnalysisConfig(py::module *m) {
       .def("enable_onnxruntime", &AnalysisConfig::EnableONNXRuntime)
       .def("disable_onnxruntime", &AnalysisConfig::DisableONNXRuntime)
       .def("onnxruntime_enabled", &AnalysisConfig::use_onnxruntime)
+      .def("use_opencl", &AnalysisConfig::use_opencl)
      .def("enable_ort_optimization", &AnalysisConfig::EnableORTOptimization)
       .def("use_gpu", &AnalysisConfig::use_gpu)
       .def("use_xpu", &AnalysisConfig::use_xpu)
@@ -783,6 +784,7 @@ void BindAnalysisConfig(py::module *m) {
            py::arg("zero_copy") = false,
            py::arg("passes_filter") = std::vector<std::string>(),
            py::arg("ops_filter") = std::vector<std::string>())
+      .def("enable_opencl", &AnalysisConfig::EnableOpenCL)
       .def("lite_engine_enabled", &AnalysisConfig::lite_engine_enabled)
       .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
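
Not part of the patch: a minimal C++ usage sketch of how the new EnableOpenCL() switch is meant to be combined with the existing EnableLiteEngine() API, assuming an inference library built with Lite and OpenCL support. The include path, model directory, and return-value handling are placeholders, not taken from this change.

// Minimal sketch; "paddle_inference_api.h" and "./mobilenet_v1" are assumptions.
#include "paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet_v1");  // hypothetical model directory
  // Route supported subgraphs to the Lite engine ...
  config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32,
                          /*zero_copy=*/true);
  // ... and let Lite schedule them on the OpenCL target added by this patch.
  config.EnableOpenCL();
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}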