// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/blas.h" #include "lite/backends/cuda/cuda_utils.h" #endif #ifdef LITE_WITH_OPENCL #include #include #include "lite/backends/opencl/cl_context.h" #include "lite/backends/opencl/cl_runtime.h" #endif #ifdef LITE_WITH_NPU #include "lite/backends/npu/npu_helper.h" #endif #include #include #include #include #include #include #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" #include "lite/utils/all.h" #ifdef LITE_WITH_OPENCL DECLARE_string(cl_path); #endif namespace paddle { namespace lite { template class Context; using HostContext = Context; using X86Context = Context; using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; template <> class Context { public: // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(HostContext* ctx) {} std::string name() const { return "HostContext"; } }; #ifdef LITE_WITH_NPU template <> class Context { public: Context() {} explicit Context(const NPUContext& ctx); // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(NPUContext* ctx) {} NPUContext& operator=(const NPUContext& ctx) {} std::string name() const { return "NPUContext"; } hiai::AiModelMngerClient* client(const std::string& model_name) const { return npu::DeviceInfo::Global().client(model_name); } }; #endif #ifdef LITE_WITH_ARM template <> class Context { public: Context() {} explicit Context(const ARMContext& ctx); ARMContext& operator=(const ARMContext& ctx) {} // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { DeviceInfo::Init(); } void CopySharedTo(ARMContext* ctx) {} void SetRunMode(lite_api::PowerMode mode, int threads) { return DeviceInfo::Global().SetRunMode(mode, threads); } void SetCache(int l1size, int l2size, int l3size) { return DeviceInfo::Global().SetCache(l1size, l2size, l3size); } void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); } lite_api::PowerMode mode() const { return DeviceInfo::Global().mode(); } int threads() const { return DeviceInfo::Global().threads(); } ARMArch arch() const { return DeviceInfo::Global().arch(); } int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } int llc_size() const { return DeviceInfo::Global().llc_size(); } bool has_dot() const { return DeviceInfo::Global().has_dot(); } bool has_fp16() const { return DeviceInfo::Global().has_fp16(); } template T* workspace_data() { return DeviceInfo::Global().workspace_data(); } bool ExtendWorkspace(size_t size) { return DeviceInfo::Global().ExtendWorkspace(size); } std::string name() const { return "ARMContext"; } }; #endif #ifdef LITE_WITH_FPGA // TODO(tianxiaogang): add needed implementation to context template <> class Context { public: Context() {} void InitOnce() {} FPGAContext& operator=(const FPGAContext& ctx) {} void CopySharedTo(FPGAContext* ctx) {} std::string name() const { return "FPGAContext"; } }; #endif #ifdef LITE_WITH_CUDA // Only works with CUDA kernels. template <> class Context { public: typename Env::Devs& devs = Env::Global(); // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() { cublas_fp32_ = std::make_shared>(); } void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { CHECK_GT(devs.size(), 0) << "Env is not initialized or current target is not exit!"; if (dev_id >= devs.size()) { LOG(WARNING) << "device index exceeds the number of devices, set to " "default device(0)!"; device_id_ = 0; } else { device_id_ = dev_id; } if (io_stream_id >= devs[dev_id].max_stream()) { LOG(WARNING) << "data stream index exceeds the maximum stream number, " "set to default stream(0)!"; io_stream_id = 0; } if (exec_stream_id >= devs[dev_id].max_stream()) { LOG(WARNING) << "exec stream index exceeds the maximum stream number, " "set to default stream(0)!"; exec_stream_id = 0; } exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; io_stream_ = devs[dev_id].io_streams()[io_stream_id]; exec_stream_id_ = exec_stream_id; io_stream_id_ = io_stream_id; } void CopySharedTo(CUDAContext* ctx) { CHECK(ctx); CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; ctx->cublas_fp32_ = cublas_fp32_; } const cudaStream_t& exec_stream() const { return exec_stream_; } void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } const cudaStream_t& io_stream() const { return io_stream_; } void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } std::shared_ptr> cublas_fp32() { return cublas_fp32_; } void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { cublas_fp32_ = cublas_fp32; } const std::vector& input_events() { return input_events_; } void SetInputEvents(const std::vector& input_events) { input_events_.clear(); input_events_.assign(input_events.begin(), input_events.end()); } const std::vector& output_events() { return output_events_; } void SetOutputEvents(const std::vector& output_events) { output_events_.clear(); output_events_.assign(output_events.begin(), output_events.end()); } std::string name() const { return "CUDAContext"; } private: int device_id_; // overall information int exec_stream_id_; int io_stream_id_; cudaStream_t exec_stream_; cudaStream_t io_stream_; // not thread-safe, should allocate for each thread. std::shared_ptr> cublas_fp32_; // kernel information std::vector input_events_; std::vector output_events_; }; #endif #ifdef LITE_WITH_X86 template <> class Context { public: Context() {} Context(Context&& ctx) {} // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} void CopySharedTo(X86Context* ctx) {} std::string name() const { return "X86Context"; } private: // overall information // // kernel information }; #endif #ifdef LITE_WITH_OPENCL template <> class Context { std::shared_ptr cl_context_; using WaitListType = std::unordered_map(nullptr)), std::shared_ptr>; std::shared_ptr cl_wait_list_; public: CLContext* cl_context() { return cl_context_.get(); } WaitListType* cl_wait_list() { return cl_wait_list_.get(); } void InitOnce() { // Init cl runtime. CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CLRuntime::Global()->set_cl_path(FLAGS_cl_path); cl_context_ = std::make_shared(); cl_wait_list_ = std::make_shared(); } void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; ctx->cl_wait_list_ = cl_wait_list_; } }; #endif // Context for running a kernel. // Holds the necessary resource and information. class KernelContext { public: template ContextT& As() { if (!ctx_.valid()) { ctx_.set(); } return *ctx_.get_mutable(); } private: Any ctx_; }; // The ContextScheduler helps to assign different context for each kernel. class ContextScheduler { public: static ContextScheduler& Global() { static auto* x = new ContextScheduler; return *x; } std::unique_ptr NewContext(TargetType target) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): kernel_contexts_[TargetType::kHost].As().CopySharedTo( &ctx->As()); break; #ifdef LITE_WITH_X86 case TARGET(kX86): kernel_contexts_[TargetType::kX86].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_CUDA case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); context.Init(dev_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; #endif #ifdef LITE_WITH_ARM case TARGET(kARM): kernel_contexts_[TargetType::kARM].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_NPU case TARGET(kNPU): kernel_contexts_[TargetType::kNPU].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_OPENCL case TARGET(kOpenCL): kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( &ctx->As()); break; #endif #ifdef LITE_WITH_FPGA case TARGET(kFPGA): kernel_contexts_[TargetType::kFPGA].As().CopySharedTo( &ctx->As()); break; #endif default: #ifndef LITE_ON_MODEL_OPTIMIZE_TOOL LOG(FATAL) << "unsupported target " << TargetToStr(target); #endif break; } return ctx; } private: template void InitContext() { kernel_contexts_[Type].As().InitOnce(); } ContextScheduler() { InitContext(); #ifdef LITE_WITH_X86 InitContext(); #endif #ifdef LITE_WITH_CUDA InitContext(); #endif #ifdef LITE_WITH_ARM InitContext(); #endif #ifdef LITE_WITH_OPENCL InitContext(); #endif #ifdef LITE_WITH_FPGA InitContext(); #endif #ifdef LITE_WITH_NPU InitContext(); #endif } private: std::map kernel_contexts_; }; } // namespace lite } // namespace paddle