未验证 提交 560c8153 编写于 作者: Y Yiqun Liu 提交者: GitHub

Add some check for CUDA Driver API and NVRTC (#22719)

* Add a check for whether the CUDA Driver and NVRTC are available on the runtime system.

* Call cuInit to initialize the CUDA Driver API before all CUDA callings.
test=develop

* Change the behavior when libnvrtc.so can not be found, printing a warning instead of exiting.
test=develop

* Do not initialize CUDA Driver API for windows and macos.
test=develop

* Remove the call of cuInit when entering paddle and enable the test_code_generator.
test=develop

* Add some built-in functions for __half.
test=develop

* Change save_intermediate_out to false in unittest.
test=develop

* Fix erroneous reference to a temporary variable when setting the include path for device_code.
test=develop
上级 9ec5eb0e
......@@ -2,7 +2,7 @@ cc_library(code_generator
SRCS operation.cc code_generator.cc code_generator_helper.cc
DEPS graph subgraph_detector)
if(WITH_GPU)
# cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
cc_test(test_code_generator SRCS code_generator_tester.cc DEPS code_generator device_code lod_tensor graph_viz_pass)
endif()
cc_library(fusion_group_pass
......
......@@ -209,7 +209,10 @@ std::set<int> CodeGenerator::DistilIntermediateIds(
for (size_t i = 0; i < expressions.size(); i++) {
for (auto id : expressions[i].GetOutputIds()) {
auto intermediate_state = expressions[i].GetIntermediateState();
if (intermediate_state[id]) intermediate_ids.insert(id);
if (intermediate_state.find(id) != intermediate_state.end() &&
intermediate_state[id]) {
intermediate_ids.insert(id);
}
}
}
return intermediate_ids;
......
......@@ -44,10 +44,11 @@ static inline std::string VarName(int index) {
class OperationExpression {
public:
explicit OperationExpression(std::string op_type, std::vector<int> input_ids,
std::vector<int> output_ids,
std::string rhs_type, std::string lhs_type,
std::unordered_map<int, bool> intermediate_state)
explicit OperationExpression(
std::string op_type, const std::vector<int>& input_ids,
const std::vector<int>& output_ids, std::string rhs_type,
std::string lhs_type,
const std::unordered_map<int, bool>& intermediate_state = {})
: op_type_(op_type),
input_ids_(input_ids),
output_ids_(output_ids),
......
......@@ -176,7 +176,6 @@ void TestMainImpl(std::string func_name, std::string code_str,
bool is_float16 = std::type_index(typeid(T)) ==
std::type_index(typeid(paddle::platform::float16));
paddle::framework::InitDevices(false, {0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
device_code.Compile(is_float16);
......@@ -266,7 +265,7 @@ void TestElementwiseMain(
}
int n = cpu_tensors[0].numel();
if (dtype == "float16") {
if (dtype == "__half") {
TestMainImpl<paddle::platform::float16>(func_name, code_str, cpu_tensors, n,
input_ids, output_ids);
} else {
......@@ -275,7 +274,7 @@ void TestElementwiseMain(
}
// Check the results
float eps = (dtype == "float16") ? 1E-2 : 1E-5;
float eps = (dtype == "__half") ? 1E-2 : 1E-5;
for (int i = 0; i < n; i++) {
fusion_group::CheckOutput(expressions, cpu_tensors, input_ids, output_ids,
i, eps);
......@@ -312,7 +311,7 @@ void TestMain(fusion_group::SubGraph* subgraph, std::vector<int> input_ids,
}
TEST(code_generator, elementwise) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
// t2 = t0 * t1
// t4 = t2 + t3
// t6 = t4 - t5
......@@ -342,7 +341,7 @@ TEST(code_generator, elementwise) {
}
TEST(code_generator, elementwise_grad) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
// The var order: t0, t1, t2, t3, t0', t1', t2', t3'
// t2 = t0 * t1
// t3 = relu(t2)
......@@ -407,7 +406,7 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(bool backward,
std::unique_ptr<paddle::framework::ir::Graph> graph(
new paddle::framework::ir::Graph(layers.main_program()));
auto proto_dtype = (dtype == "float16")
auto proto_dtype = (dtype == "__half")
? paddle::framework::proto::VarType::FP16
: paddle::framework::proto::VarType::FP32;
for (auto* n : graph->Nodes()) {
......@@ -463,10 +462,10 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
}
TEST(code_generator, subgraph) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
std::unique_ptr<paddle::framework::ir::Graph> graph =
BuildGraph(false, dtype);
fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", true,
fusion_group::SubGraph subgraph(0, "elementwise_kernel_1", false,
graph->Nodes());
// Expressions generated by code_generator (they may be different):
......@@ -482,10 +481,10 @@ TEST(code_generator, subgraph) {
}
TEST(code_generator, subgraph_grad) {
for (std::string dtype : {"float", "float16"}) {
for (std::string dtype : {"float", "__half"}) {
std::unique_ptr<paddle::framework::ir::Graph> graph =
BuildGraph(true, dtype);
fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", true,
fusion_group::SubGraph subgraph(0, "elementwise_grad_kernel_1", false,
DistilGradNodes(graph));
// Expressions generated by code_generator (they may be different):
......
......@@ -35,6 +35,7 @@ __device__ inline double Sqrt(double x) { return sqrt(x); }
)";
// List some built-in functions of __half implemented in cuda_fp16.hpp
static constexpr char predefined_cuda_functions_fp16[] = R"(
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
......@@ -193,6 +194,12 @@ __CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) {
return v;
}
__CUDA_FP16_DECL__ __half __hneg(const __half a)
{
__half zero;
zero = __float2half(0.0);
return __hsub(zero, a);
}
/* Some basic arithmetic operations expected of a builtin */
__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
......@@ -200,6 +207,10 @@ __device__ __forceinline__ __half operator-(const __half &lh, const __half &rh)
__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
/* Unary plus and inverse operators */
__device__ __forceinline__ __half operator+(const __half &h) { return h; }
__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
/* Some basic comparison operations to make it look like a builtin */
__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); }
......
......@@ -30,6 +30,14 @@ namespace ir {
void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init("fusion_group_pass", graph);
if (Get<bool>("use_gpu")) {
// TODO(liuyiqun): open this check.
// if (!platform::CUDADeviceCode::IsAvailable()) {
// LOG(WARNING)
// << "Disable fusion_group because CUDA Driver or NVRTC is not
// available.";
// return 0;
// }
fusion_group::OperationMap::Init();
int num_elementwise_groups = DetectFusionGroup(graph, 0);
AddStatis(num_elementwise_groups);
......@@ -61,7 +69,7 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
subgraph.DetectIntermediateOutWithGraph(graph);
}
if (subgraph.IsValid(min_subgraph_size)) {
subgraph.SetFuncName("FusedElementwise" + std::to_string(index++));
subgraph.SetFuncName("fused_elementwise_" + std::to_string(index++));
if (GenerateCode(&subgraph)) {
InsertFusionGroupOp(graph, &subgraph);
num_subgraphs++;
......
......@@ -79,9 +79,61 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
#endif
}
}
#ifdef PADDLE_WITH_CUDA
CUDADeviceCode::CheckAvailableStatus();
#endif
}
#ifdef PADDLE_WITH_CUDA
// Checks the result of a CUDA Driver API call.
// On failure, logs a warning (only once per process, to avoid log spam) that
// names the calling API, the kernel involved (if any), and the driver's
// error description, then returns false. Returns true on CUDA_SUCCESS.
static bool CheckCUDADriverResult(CUresult result, std::string caller,
                                  std::string kernel_name = "") {
  if (result != CUDA_SUCCESS) {
    const char* error = nullptr;
    // cuGetErrorString leaves `error` unchanged (nullptr here) when `result`
    // is not a recognized error code; guard against streaming a null
    // const char* into the log stream, which is undefined behavior.
    dynload::cuGetErrorString(result, &error);
    LOG_FIRST_N(WARNING, 1) << "Call " << caller << " for < " << kernel_name
                            << " > failed: "
                            << (error != nullptr ? error : "unknown error")
                            << " (" << result << ")";
    return false;
  }
  return true;
}
bool CUDADeviceCode::available_ = false;
void CUDADeviceCode::CheckAvailableStatus() {
available_ = false;
if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
LOG_FIRST_N(WARNING, 1)
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return;
}
int nvrtc_major = 0;
int nvrtc_minor = 0;
nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor);
int driver_version = 0;
int dirver_major = 0;
int driver_minor = 0;
CUresult driver_result = dynload::cuDriverGetVersion(&driver_version);
if (driver_result == CUDA_SUCCESS) {
dirver_major = driver_version / 1000;
driver_minor = (driver_version % 1000) / 10;
}
LOG_FIRST_N(INFO, 1) << "CUDA Driver Version: " << dirver_major << "."
<< driver_minor << "; NVRTC Version: " << nvrtc_major
<< "." << nvrtc_minor;
if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) {
return;
}
int count = 0;
if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count),
"cuDeviceGetCount")) {
available_ = true;
}
}
static std::string FindCUDAIncludePath() {
auto EndWith = [](std::string str, std::string substr) -> bool {
size_t pos = str.rfind(substr);
......@@ -137,7 +189,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
bool CUDADeviceCode::Compile(bool include_path) {
is_compiled_ = false;
if (!dynload::HasNVRTC() || !dynload::HasCUDADriver()) {
LOG(WARNING)
LOG_FIRST_N(WARNING, 1)
<< "NVRTC and CUDA driver are need for JIT compiling of CUDA code.";
return false;
}
......@@ -160,10 +212,11 @@ bool CUDADeviceCode::Compile(bool include_path) {
std::string compute_flag =
"--gpu-architecture=compute_" + std::to_string(compute_capability);
std::vector<const char*> options = {"--std=c++11", compute_flag.c_str()};
std::string include_option;
if (include_path) {
std::string cuda_include_path = FindCUDAIncludePath();
if (!cuda_include_path.empty()) {
std::string include_option = "--include-path=" + cuda_include_path;
include_option = "--include-path=" + cuda_include_path;
options.push_back(include_option.c_str());
}
}
......@@ -209,13 +262,13 @@ bool CUDADeviceCode::Compile(bool include_path) {
}
if (!CheckCUDADriverResult(dynload::cuModuleLoadData(&module_, ptx_.data()),
"cuModuleLoadData")) {
"cuModuleLoadData", name_)) {
return false;
}
if (!CheckCUDADriverResult(
dynload::cuModuleGetFunction(&function_, module_, name_.c_str()),
"cuModuleGetFunction")) {
"cuModuleGetFunction", name_)) {
return false;
}
......@@ -253,19 +306,9 @@ void CUDADeviceCode::Launch(const size_t n, std::vector<void*>* args) const {
bool CUDADeviceCode::CheckNVRTCResult(nvrtcResult result,
std::string function) {
if (result != NVRTC_SUCCESS) {
LOG(WARNING) << "Call " << function
<< " failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
return true;
}
bool CUDADeviceCode::CheckCUDADriverResult(CUresult result,
std::string function) {
if (result != CUDA_SUCCESS) {
const char* error = nullptr;
LOG(WARNING) << "Call " << function
<< " failed: " << dynload::cuGetErrorString(result, &error);
LOG_FIRST_N(WARNING, 1)
<< "Call " << function << " for < " << name_
<< " > failed: " << dynload::nvrtcGetErrorString(result);
return false;
}
return true;
......
......@@ -56,9 +56,13 @@ class CUDADeviceCode : public DeviceCode {
workload_per_thread_ = workload_per_thread;
}
static void CheckAvailableStatus();
static bool IsAvailable() { return available_; }
private:
bool CheckNVRTCResult(nvrtcResult result, std::string function);
bool CheckCUDADriverResult(CUresult result, std::string function);
static bool available_;
bool is_compiled_{false};
int max_threads_{0};
......
......@@ -60,6 +60,8 @@ extern bool HasCUDADriver();
* include all needed cuda driver functions
**/
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cuInit); \
__macro(cuDriverGetVersion); \
__macro(cuGetErrorString); \
__macro(cuModuleLoadData); \
__macro(cuModuleGetFunction); \
......@@ -68,7 +70,7 @@ extern bool HasCUDADriver();
__macro(cuLaunchKernel); \
__macro(cuCtxCreate); \
__macro(cuCtxGetCurrent); \
__macro(cuDeviceGet); \
__macro(cuDeviceGetCount); \
__macro(cuDevicePrimaryCtxGetState)
CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP);
......
......@@ -164,7 +164,7 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
}
auto error_msg =
"Failed to find dynamic library: %s ( %s ) \n Please specify "
"its path correctly using following ways: \n Method. set "
"its path correctly using following ways: \n set "
"environment variable LD_LIBRARY_PATH on Linux or "
"DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
"export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
......@@ -238,17 +238,17 @@ void* GetCusolverDsoHandle() {
void* GetNVRTCDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false);
#endif
}
void* GetCUDADsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false);
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so");
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so", false);
#endif
}
......
......@@ -60,6 +60,7 @@ extern bool HasNVRTC();
* include all needed nvrtc functions
**/
#define NVRTC_ROUTINE_EACH(__macro) \
__macro(nvrtcVersion); \
__macro(nvrtcGetErrorString); \
__macro(nvrtcCompileProgram); \
__macro(nvrtcCreateProgram); \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册