// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/context.h" #endif #ifdef LITE_WITH_OPENCL #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" #endif #ifdef LITE_WITH_MLU #include #include #include // NOLINT #include "lite/backends/mlu/mlu_utils.h" #endif #ifdef LITE_WITH_XPU #include "lite/backends/xpu/xpu_header_sitter.h" #endif #include #include #include #include #include #include #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" #include "lite/utils/env.h" namespace paddle { namespace lite { template class Context; using HostContext = Context; using X86Context = Context; using ARMContext = Context; using NPUContext = Context; using APUContext = Context; using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; using RKNPUContext = Context; template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(HostContext* ctx) {} std::string name() const { return "HostContext"; } }; #ifdef LITE_WITH_NPU template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(NPUContext* ctx) {} NPUContext& operator=(const NPUContext& ctx) {} std::string name() const { return "NPUContext"; } static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) { subgraph_model_cache_dir_ = subgraph_model_cache_dir; } static std::string SubgraphModelCacheDir() { return subgraph_model_cache_dir_; } private: static std::string subgraph_model_cache_dir_; }; #endif #ifdef LITE_WITH_APU template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(APUContext* ctx) {} APUContext& operator=(const APUContext& ctx) {} std::string name() const { return "APUContext"; } }; #endif #ifdef LITE_WITH_BM template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { TargetWrapperBM::SetDevice(TargetWrapperBM::GetDevice()); } void CopySharedTo(BMContext* ctx) {} void* GetHandle() { return TargetWrapperBM::GetHandle(); } std::string name() const { return "BMContext"; } }; #endif #ifdef LITE_WITH_RKNPU template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(RKNPUContext* ctx) {} RKNPUContext& operator=(const RKNPUContext& ctx) {} std::string name() const { return "RKNPUContext"; } }; #endif #ifdef LITE_WITH_XPU template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(XPUContext* ctx) {} static xdnn::Context* GetRawContext() { if (_tls_raw_ctx == nullptr) { _tls_raw_ctx = xdnn::create_context(); CHECK(_tls_raw_ctx); int r = xdnn::set_workspace_l3_size(_tls_raw_ctx, _workspace_l3_size_per_thread); if (r != 0) { LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r << ", _workspace_l3_size_per_thread = " << _workspace_l3_size_per_thread; } } return _tls_raw_ctx; } static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { _workspace_l3_size_per_thread = l3_size; } // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker // thread static void SetDev(int dev_no = 0) { const char* dev_env = getenv("LITE_XPU_DEV"); if (dev_env) { xpu_set_device(atoi(dev_env)); return; } xpu_set_device(dev_no); } std::string name() const { return "XPUContext"; } public: static std::string _multi_encoder_precision; // NOLINT private: static thread_local xdnn::Context* _tls_raw_ctx; static int _workspace_l3_size_per_thread; }; #endif #ifdef LITE_WITH_ARM template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { DeviceInfo::Init(); } void CopySharedTo(ARMContext* ctx) {} void SetRunMode(lite_api::PowerMode mode, int threads) { return DeviceInfo::Global().SetRunMode(mode, threads); } void SetCache(int l1size, int l2size, int l3size) { return DeviceInfo::Global().SetCache(l1size, l2size, l3size); } void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); } lite_api::PowerMode mode() const { return DeviceInfo::Global().mode(); } int threads() const { return DeviceInfo::Global().threads(); } ARMArch arch() const { return DeviceInfo::Global().arch(); } int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } int llc_size() const { return DeviceInfo::Global().llc_size(); } bool has_dot() const { return DeviceInfo::Global().has_dot(); } bool has_fp16() const { return DeviceInfo::Global().has_fp16(); } template T* workspace_data() { return DeviceInfo::Global().workspace_data(); } bool ExtendWorkspace(size_t size) { return DeviceInfo::Global().ExtendWorkspace(size); } std::string name() const { return "ARMContext"; } }; #endif #ifdef LITE_WITH_FPGA // TODO(tianxiaogang): add needed implementation to context template <> class Context { public: void InitOnce() {} FPGAContext& operator=(const FPGAContext& ctx) {} void CopySharedTo(FPGAContext* ctx) {} std::string name() const { return "FPGAContext"; } }; #endif #ifdef LITE_WITH_MLU template <> class Context { public: typename Env::Devs& devs = Env::Global(); void InitOnce() {} MLUContext& operator=(const MLUContext& ctx) { this->Init(ctx.device_id_, ctx.exec_queue_id_); return *this; } void Init(int dev_id, int exec_queue_id = 0) { CHECK_GT(devs.size(), 0UL) << "Env is not initialized or current target is not exit!"; if (dev_id >= static_cast(devs.size())) { LOG(WARNING) << "device index exceeds the number of devices, set to " "default device(0)!"; device_id_ = 0; } else { device_id_ = dev_id; } SetMluDevice(device_id_); // get queue id from map std::unique_lock lk(map_mutex_); if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) { queue_id_map_[exec_queue_id] = next_queue_id_++ % devs[dev_id].max_queue(); } exec_queue_id_ = queue_id_map_[exec_queue_id]; VLOG(4) << "pick mlu queue id: " << exec_queue_id_; lk.unlock(); io_queue_ = devs[dev_id].io_queues()[exec_queue_id_]; exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_]; } void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } const cnrtQueue_t& exec_queue() const { return exec_queue_; } void SetExecQueue(cnrtQueue_t queue) { exec_queue_ = queue; } const cnrtQueue_t& io_queue() const { return io_queue_; } void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); } int MLUCoreNumber() { return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); } u32_t affinity() { return affinity_; } cnrtInvokeFuncParam_t forward_param() { return forward_param_; } int device_id() { return device_id_; } std::string name() const { return "MLUContext"; } private: static int next_queue_id_; static std::map queue_id_map_; static std::mutex map_mutex_; int device_id_; // overall information int exec_queue_id_; cnrtQueue_t io_queue_; cnrtQueue_t exec_queue_; std::vector input_notifiers_; std::vector output_notifiers_; cnrtInvokeFuncParam_t forward_param_; u32_t affinity_ = 0x01; }; #endif // LITE_WITH_MLU #ifdef LITE_WITH_X86 template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(X86Context* ctx) {} std::string name() const { return "X86Context"; } private: // overall information // // kernel information }; #endif #ifdef LITE_WITH_OPENCL template <> class Context { std::shared_ptr cl_context_; public: CLContext* cl_context() { return cl_context_.get(); } void InitOnce() { // Init cl runtime. CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; cl_context_ = std::make_shared(); } void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; } }; #endif // Context for running a kernel. // Holds the necessary resource and information. class KernelContext { public: template ContextT& As() { if (!ctx_.valid()) { ctx_.set(); } return *ctx_.get_mutable(); } private: Any ctx_; }; // The ContextScheduler helps to assign different context for each kernel. class ContextScheduler { public: static ContextScheduler& Global() { static auto* x = new ContextScheduler; return *x; } std::unique_ptr NewContext( TargetType target, /*only used for cuda context*/ int exec_stream_id = 0) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): kernel_contexts_[TargetType::kHost].As().CopySharedTo( &ctx->As()); break; #ifdef LITE_WITH_X86 case TARGET(kX86): kernel_contexts_[TargetType::kX86].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_CUDA case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; #endif #ifdef LITE_WITH_ARM case TARGET(kARM): kernel_contexts_[TargetType::kARM].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_NPU case TARGET(kNPU): kernel_contexts_[TargetType::kNPU].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_APU case TARGET(kAPU): kernel_contexts_[TargetType::kAPU].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_RKNPU case TARGET(kRKNPU): kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_OPENCL case TARGET(kOpenCL): kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_FPGA case TARGET(kFPGA): kernel_contexts_[TargetType::kFPGA].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_BM case TARGET(kBM): kernel_contexts_[TargetType::kBM].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_MLU case TARGET(kMLU): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kMLU].As().CopySharedTo( &context); LOG(INFO) << "New Context for MLU"; } break; #endif default: #if (!defined LITE_ON_MODEL_OPTIMIZE_TOOL) && (!defined LITE_WITH_PYTHON) LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; } return ctx; } private: template void InitContext() { kernel_contexts_[Type].As().InitOnce(); } ContextScheduler() { InitContext(); #ifdef LITE_WITH_X86 InitContext(); #endif #ifdef LITE_WITH_CUDA InitContext(); #endif #ifdef LITE_WITH_ARM InitContext(); #endif #ifdef LITE_WITH_OPENCL InitContext(); #endif #ifdef LITE_WITH_FPGA InitContext(); #endif #ifdef LITE_WITH_NPU InitContext(); #endif #ifdef LITE_WITH_APU InitContext(); #endif #ifdef LITE_WITH_RKNPU InitContext(); #endif #ifdef LITE_WITH_XPU InitContext(); #endif #ifdef LITE_WITH_BM InitContext(); #endif #ifdef LITE_WITH_MLU InitContext(); #endif } private: std::map kernel_contexts_; }; } // namespace lite } // namespace paddle