diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
index 49dbb0106b634b35898c36dde4a5b27698f3cbb2..fe2bd27524fbfc7f3d2b68f5e3da8feefd46557a 100644
--- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
@@ -2,7 +2,7 @@ cc_library(code_generator
   SRCS operation.cc code_generator.cc code_generator_helper.cc
   DEPS graph subgraph_detector)
 if(WITH_GPU)
-# cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
+  cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
 endif()
 
 cc_library(fusion_group_pass
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator.cc b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
index 6555a65c3bd4c21740b3d44ac18a756d50d22f00..431d3c05f6dd4b44074729716555744773f950e7 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator.cc
@@ -209,7 +209,10 @@ std::set<int> CodeGenerator::DistilIntermediateIds(
   for (size_t i = 0; i < expressions.size(); i++) {
     for (auto id : expressions[i].GetOutputIds()) {
       auto intermediate_state = expressions[i].GetIntermediateState();
-      if (intermediate_state[id]) intermediate_ids.insert(id);
+      if (intermediate_state.find(id) != intermediate_state.end() &&
+          intermediate_state[id]) {
+        intermediate_ids.insert(id);
+      }
     }
   }
   return intermediate_ids;
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
index 63197c00ff1ced99bd1e91a51a530e7999e77268..03d28277afbbb7467f638d521414d702bc8e8179 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
@@ -44,10 +44,11 @@ static inline std::string VarName(int index) {
 
 class OperationExpression {
  public:
-  explicit OperationExpression(std::string op_type, std::vector<int> input_ids,
-                               std::vector<int> output_ids,
-                               std::string rhs_type, std::string lhs_type,
-                               std::unordered_map<int, bool> intermediate_state)
+  explicit OperationExpression(
+      std::string op_type, const std::vector<int>& input_ids,
+      const std::vector<int>& output_ids, std::string rhs_type,
+      std::string lhs_type,
+      const std::unordered_map<int, bool>& intermediate_state = {})
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
index 92dec555cab996e2dc7549311ae813953e6ce9dd..89b05fc577bb46606ff5c43d0dd697bd7b8aed38 100644
--- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
+++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
@@ -176,7 +176,6 @@ void TestMainImpl(std::string func_name, std::string code_str,
   bool is_float16 = std::type_index(typeid(T)) ==
                     std::type_index(typeid(paddle::platform::float16));
 
-  paddle::framework::InitDevices(false, {0});
   paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
   paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
   device_code.Compile(is_float16);
@@ -266,7 +265,7 @@ void TestElementwiseMain(
   }
 
   int n = cpu_tensors[0].numel();
-  if (dtype == "float16") {
+  if (dtype == "__half") {
     TestMainImpl<paddle::platform::float16>(func_name, code_str, cpu_tensors,
                                             n, input_ids, output_ids);
   } else {
@@ -275,7 +274,7 @@ void TestElementwiseMain(
   }
 
   // Check the results
-  float eps = (dtype == "float16") ? 1E-2 : 1E-5;
+  float eps = (dtype == "__half") ? 1E-2 : 1E-5;
   for (int i = 0; i < n; i++) {
     fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
                               i, eps);
@@ -312,7 +311,7 @@ void TestMain(fusion_group::SubGraph* subgraph, std::vector<int> input_ids,
 }
 
 TEST(code_generator, elementwise) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     // t2 = t0 * t1
     // t4 = t2 + t3
     // t6 = t4 - t5
@@ -342,7 +341,7 @@ TEST(code_generator, elementwise) {
 }
 
 TEST(code_generator, elementwise_grad) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     // The var order: t0, t1, t2, t3, t0', t1', t2', t3'
     // t2 = t0 * t1
     // t3 = relu(t2)
@@ -407,7 +406,7 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(bool backward,
 
   std::unique_ptr<paddle::framework::ir::Graph> graph(
       new paddle::framework::ir::Graph(layers.main_program()));
-  auto proto_dtype = (dtype == "float16")
+  auto proto_dtype = (dtype == "__half")
                          ? paddle::framework::proto::VarType::FP16
                          : paddle::framework::proto::VarType::FP32;
   for (auto* n : graph->Nodes()) {
@@ -463,10 +462,10 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
 }
 
 TEST(code_generator, subgraph) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(false, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
+    fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
                                     graph->Nodes());
 
     // Expressions generated by code_generator (they may be different):
@@ -482,10 +481,10 @@ TEST(code_generator, subgraph) {
 }
 
 TEST(code_generator, subgraph_grad) {
-  for (std::string dtype : {"float", "float16"}) {
+  for (std::string dtype : {"float", "__half"}) {
     std::unique_ptr<paddle::framework::ir::Graph> graph =
         BuildGraph(true, dtype);
-    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
+    fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
                                     DistilGradNodes(graph));
 
     // Expressions generated by code_generator (they may be different):
diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
index 6514b87b06ec1602ef5deb55e53094aeb1803f65..67838b4e3c75387e617efc3712b82dfe15382395 100644
--- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
+++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h
@@ -35,6 +35,7 @@ __device__ inline double Sqrt(double x) { return sqrt(x); }
 
 )";
 
+// List some built-in functions of __half implemented in cuda_fp16.hpp
 static constexpr char predefined_cuda_functions_fp16[] = R"(
 #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
 #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
@@ -193,6 +194,12 @@ __CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) {
   return v;
 }
 
+__CUDA_FP16_DECL__ __half __hneg(const __half a)
+{
+  __half zero;
+  zero = __float2half(0.0);
+  return __hsub(zero, a);
+}
 /* Some basic arithmetic operations expected of a builtin */
 __device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
@@ -200,6 +207,10 @@ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh)
 __device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
 __device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
 
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __half operator+(const __half &h) { return h; }
+__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
+
 /* Some basic comparison operations to make it look like a builtin */
 __device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
 __device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); }
diff --git a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
index d70c81736dcab584312f83990a32d994f004cdcf..4dbb8ae4c6ec703c80da2f3a7a3e4bc59f7b2439 100644
--- a/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+++ b/paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
@@ -30,6 +30,14 @@ namespace ir {
 void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
   FusePassBase::Init("fusion_group_pass", graph);
   if (Get<bool>("use_gpu")) {
+    // TODO(liuyiqun): enable this check.
+    // if (!platform::CUDADeviceCode::IsAvailable()) {
+    //   LOG(WARNING)
+    //       << "Disable fusion_group because CUDA Driver or NVRTC is not
+    //       available.";
+    //   return;
+    // }
+
     fusion_group::OperationMap::Init();
     int num_elementwise_groups = DetectFusionGroup(graph, 0);
     AddStatis(num_elementwise_groups);
@@ -61,7 +69,7 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
       subgraph.DetectIntermediateOutWithGraph(graph);
     }
     if (subgraph.IsValid(min_subgraph_size)) {
-      subgraph.SetFuncName("FusedElementwise" + std::to_string(index++));
+      subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
         InsertFusionGroupOp(graph, &subgraph);
         num_subgraphs++;
diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc
index 4f13f8e38893bd55c12771e9c8b180ac37a5259a..e8b2d5d4ed12d516776eace322d0aff6cb381d71 100644
--- a/paddle/fluid/platform/device_code.cc
+++ b/paddle/fluid/platform/device_code.cc
@@ -79,9 +79,61 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
 #endif
     }
   }
+
+#ifdef PADDLE_WITH_CUDA
+  CUDADeviceCode::CheckAvailableStatus();
+#endif
 }
 
 #ifdef PADDLE_WITH_CUDA
+static bool CheckCUDADriverResult(CUresult result, std::string caller,
+                                  std::string kernel_name = "") {
+  if (result != CUDA_SUCCESS) {
+    const char* error = nullptr;
+    dynload::cuGetErrorString(result, &error);
+    LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
+                            << " > failed: " << error << " (" << result << ")";
+    return false;
+  }
+  return true;
+}
+
+bool CUDADeviceCode::available_ = false;
+void CUDADeviceCode::CheckAvailableStatus() {
+  available_ = false;
+  if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
+    LOG_FIRST_N(WARNING, 1)
+        << "NVRTC and CUDA driver are needed for JIT compiling of CUDA code.";
+    return;
+  }
+
+  int nvrtc_major = 0;
+  int nvrtc_minor = 0;
+  nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
+
+  int driver_version = 0;
+  int driver_major = 0;
+  int driver_minor = 0;
+  CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
+  if (driver_result == CUDA_SUCCESS) {
+    driver_major = driver_version / 1000;
+    driver_minor = (driver_version % 1000) / 10;
+  }
+
+  LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << driver_major << "."
+                       << driver_minor << "; NVRTC Version: " << nvrtc_major
+                       << "." << nvrtc_minor;
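+  // Mark JIT compilation as available only when the version queries above
+  // succeeded and the device count can be queried through the CUDA driver.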
+  if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
+    return;
+  }
+
+  int count = 0;
+  if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
+                            "cuDeviceGetCount")) {
+    available_ = true;
+  }
+}
+
 static std::string FindCUDAIncludePath() {
   auto EndWith = [](std::string str, std::string substr) -> bool {
     size_t pos = str.rfind(substr);
@@ -137,7 +189,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
 bool CUDADeviceCode::Compile(bool include_path) {
   is_compiled_ = false;
   if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
-    LOG(WARNING)
+    LOG_FIRST_N(WARNING, 1)
         << "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
     return false;
   }
@@ -160,10 +212,11 @@ bool CUDADeviceCode::Compile(bool include_path) {
   std::string compute_flag =
       "--gpu-architecture=compute_" + std::to_string(compute_capability);
   std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
+  std::string include_option;
   if (include_path) {
     std::string cuda_include_path = FindCUDAIncludePath();
     if (!cuda_include_path.empty()) {
-      std::string include_option = "--include-path=" + cuda_include_path;
+      include_option = "--include-path=" + cuda_include_path;
       options.push_back(include_option.c_str());
     }
   }
@@ -209,13 +262,13 @@ bool CUDADeviceCode::Compile(bool include_path) {
   }
 
   if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
-                             "cuModuleLoadData")) {
+                             "cuModuleLoadData", name_)) {
     return false;
   }
 
   if (!CheckCUDADriverResult(
           dynload::cuModuleGetFunction(&function_, module_, name_.c_str()),
-          "cuModuleGetFunction")) {
+          "cuModuleGetFunction", name_)) {
     return false;
   }
 
@@ -253,19 +306,9 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
 bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
                                       std::string function) {
   if (result != NVRTC_SUCCESS) {
-    LOG(WARNING) << "Call " << function
-                 << " failed: " << dynload::nvrtcGetErrorString(result);
-    return false;
-  }
-  return true;
-}
-
-bool CUDADeviceCode::CheckCUDADriverResult(CUresult result,
-                                           std::string function) {
-  if (result != CUDA_SUCCESS) {
-    const char* error = nullptr;
-    LOG(WARNING) << "Call " << function
-                 << " failed: " << dynload::cuGetErrorString(result, &error);
+    LOG_FIRST_N(WARNING, 1)
+        << "Call " << function << " for < " << name_
+        << " > failed: " << dynload::nvrtcGetErrorString(result);
     return false;
   }
   return true;
diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h
index 385207544063bfc3c1d53a51cc55e2ff18e60b50..6128d8b78db05e6ba92ffae46ee4c9cc8895c105 100644
--- a/paddle/fluid/platform/device_code.h
+++ b/paddle/fluid/platform/device_code.h
@@ -56,9 +56,13 @@ class CUDADeviceCode : public DeviceCode {
     workload_per_thread_ = workload_per_thread;
   }
 
+  static void CheckAvailableStatus();
+  static bool IsAvailable() { return available_; }
+
 private:
   bool CheckNVRTCResult(nvrtcResult result, std::string function);
-  bool CheckCUDADriverResult(CUresult result, std::string function);
+
+  static bool available_;
 
   bool is_compiled_{false};
   int max_threads_{0};
diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h
index a37a47b7900d7ac50b277197b42d4f431bde7179..d39aceaa11bca0cce4f160c0a66dbd92d1813acb 100644
--- a/paddle/fluid/platform/dynload/cuda_driver.h
+++ b/paddle/fluid/platform/dynload/cuda_driver.h
@@ -60,6 +60,8 @@ extern bool HasCUDADriver();
  * include all needed cuda driver functions
  **/
 #define CUDA_ROUTINE_EACH(__macro)        \
+  __macro(cuInit);                        \
+  __macro(cuDriverGetVersion);            \
   __macro(cuGetErrorString);              \
   __macro(cuModuleLoadData);              \
   __macro(cuModuleGetFunction);           \
@@ -68,7 +70,7 @@ extern bool HasCUDADriver();
   __macro(cuLaunchKernel);                \
   __macro(cuCtxCreate);                   \
   __macro(cuCtxGetCurrent);               \
-  __macro(cuDeviceGet);                   \
+  __macro(cuDeviceGetCount);              \
   __macro(cuDevicePrimaryCtxGetState)
 
 CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index dd62265e333c9832eb6b1f7bed7cffe726a17c70..8ed998b0a464eb2e732b3dec7b1bd82c84163e3f 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -164,10 +164,10 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
   }
   auto error_msg =
       "Failed to find dynamic library: %s ( %s ) \n Please specify "
-      "its path correctly using following ways: \n Method. set "
+      "its path correctly using following ways: \n set "
      "environment variable LD_LIBRARY_PATH on Linux or "
-      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
-      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
      "using the DYLD_LIBRARY_PATH is impossible unless System "
      "Integrity Protection (SIP) is disabled.";
 #if !defined(_WIN32)
@@ -238,17 +238,17 @@ void* GetCusolverDsoHandle() {
 
 void* GetNVRTCDsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
 #endif
 }
 
 void* GetCUDADsoHandle() {
 #if defined(__APPLE__) || defined(__OSX__)
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
 #else
-  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so");
+  return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
 #endif
 }
 
diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h
index b4437099baa63b46bb646242c45b34de524da916..08f81d4ea85610238aa0053c0008c0a03e2361d6 100644
--- a/paddle/fluid/platform/dynload/nvrtc.h
+++ b/paddle/fluid/platform/dynload/nvrtc.h
@@ -60,6 +60,7 @@ extern bool HasNVRTC();
  * include all needed nvrtc functions
  **/
 #define NVRTC_ROUTINE_EACH(__macro)       \
+  __macro(nvrtcVersion);                  \
   __macro(nvrtcGetErrorString);           \
   __macro(nvrtcCompileProgram);           \
   __macro(nvrtcCreateProgram);            \