Commit 0e3a6329 authored by Megvii Engine Team

build(cuda): support cu111 build

GitOrigin-RevId: b3067ba4d5f1225048838e4b2460d858c475b75e
Parent e9db061e
......@@ -39,6 +39,9 @@ option(MGE_DISABLE_FLOAT16 "Disable MegEngine float16 support." OFF)
option(MGE_WITH_CUDA "Enable MegEngine CUDA support." ON)
option(MGE_CUDA_USE_STATIC "Enable MegEngine CUDA static linking." ON)
option(MGE_WITH_TRT "Build MegEngine with TensorRT." ON)
option(MGE_WITH_CUDA_STUB "Build MegEngine with CUDA stub." ON)
option(MGE_WITH_NVRTC_STUB "Build MegEngine with NVRTC stub." OFF)
option(MGE_WITH_CUDNN_SHARED "Build MegEngine with CUDNN shared." OFF)
option(MGE_USE_SYSTEM_LIB "Build MegEngine with system libraries." OFF)
option(MGB_WITH_FLATBUFFERS "Build MegBrain with FlatBuffers serialization support." ON)
option(MGE_WITH_CAMBRICON "Build MegEngine with Cambricon support" OFF)
......@@ -55,6 +58,14 @@ option(MGE_BUILD_SDK "Build load_and_run" ON)
option(MGE_INFERENCE_ONLY "Build inference only library." OFF)
option(MGE_WITH_MKLDNN "Enable Intel MKL_DNN support." ON)
option(MGE_WITH_ROCM "Enable ROCM support" OFF)
option(MGE_WITH_LARGE_ARCHIVE "Enable big archive link support" OFF)
if(MGE_WITH_NVRTC_STUB OR MGE_WITH_CUDA_STUB)
set(MGE_WITH_ANY_CUDA_STUB ON)
else()
set(MGE_WITH_ANY_CUDA_STUB OFF)
endif()
if(NOT ${MGE_BIN_REDUCE} STREQUAL "")
message(STATUS "build with BIN REDUCE")
......@@ -205,14 +216,24 @@ else()
endif()
endif()
if(MGE_WITH_CUDA)
include(cmake/cudnn.cmake)
if(MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED))
message(WARNING "Static link CUDNN8 will auto enable MGE_WITH_LARGE_ARCHIVE=ON")
set(MGE_WITH_LARGE_ARCHIVE ON)
endif()
endif()
CHECK_CXX_COMPILER_FLAG(-fuse-ld=gold CXX_SUPPORT_GOLD)
if(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32)
if(MGE_WITH_LARGE_ARCHIVE)
message(STATUS "Set -mcmodel=large and disable -fuse-ld=gold")
set(MGE_COMMON_LINKER_FLAGS "-mcmodel=large")
elseif(CXX_SUPPORT_GOLD AND NOT ANDROID AND NOT APPLE AND NOT MSVC AND NOT WIN32 AND NOT MGE_WITH_LARGE_ARCHIVE)
message(STATUS "Using GNU gold linker.")
set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(MGE_COMMON_LINKER_FLAGS "-fuse-ld=gold")
endif()
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${MGE_COMMON_LINKER_FLAGS}")
if(NOT MGE_WITH_JIT)
if(MGE_WITH_HALIDE)
......@@ -353,11 +374,28 @@ if(MGE_WITH_CUDA)
if(NOT MGE_ENABLE_EXCEPTIONS)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -fno-exceptions")
endif()
if(NOT MGE_CUDA_GENCODE)
if(${MGE_ARCH} STREQUAL "x86_64" OR ${MGE_ARCH} STREQUAL "i386")
set(MEGDNN_THREADS_512 0)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0")
if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND ("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED))
message(WARNING "Static link CUDNN8 with many sm is unworkable, we only enable sm61 sm70 sm75 by default, and enable MGE_WITH_LARGE_ARCHIVE=ON")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.1.0")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=sm_86")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_86,code=compute_86")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "11.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "11.0.0")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_70,code=sm_70")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_75,code=sm_75")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=sm_80")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_80,code=compute_80")
elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.0.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.0.0")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_52,code=sm_52")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_60,code=sm_60")
set(MGE_CUDA_GENCODE "${MGE_CUDA_GENCODE} -gencode arch=compute_61,code=sm_61")
......@@ -385,7 +423,6 @@ if(MGE_WITH_CUDA)
endif()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${MGE_CUDA_GENCODE}")
include(cmake/cudnn.cmake)
if(MGE_WITH_TRT)
include(cmake/tensorrt.cmake)
endif()
......@@ -394,12 +431,30 @@ if(MGE_WITH_CUDA)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS ${TRT_LIBRARY} ${CUDNN_LIBRARY})
message(STATUS "windows TRT_LIBRARY: ${TRT_LIBRARY}")
else()
if(TensorRT_VERSION_MAJOR GREATER_EQUAL 7)
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer myelin_compiler_static myelin_executor_static myelin_pattern_runtime_static myelin_pattern_library_static -Wl,--no-whole-archive)
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer -Wl,--no-whole-archive)
endif()
endif()
endif()
if("${CUDNN_VERSION}" STREQUAL "7.5.0")
if(MSVC OR WIN32)
message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY})
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libnvinfer libcudnn -Wl,--no-whole-archive)
message(STATUS "cudnn 7.5.0 has bug in cudnnConvolutionBiasActivationForward, need --whole-archive to workaround, ref https://docs.nvidia.com/deeplearning/cudnn/release-notes/rel_7xx.html")
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive)
endif()
else()
list(APPEND MGE_CUDA_LIBS -Wl,--whole-archive libcudnn -Wl,--no-whole-archive)
if(MSVC OR WIN32)
message(STATUS "windows CUDNN_LIBRARY: ${CUDNN_LIBRARY}")
list(APPEND MGE_CUDA_LIBS ${CUDNN_LIBRARY})
else()
list(APPEND MGE_CUDA_LIBS libcudnn)
endif()
endif()
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cusolver.lib cublas.lib curand.lib cudart_static.lib cusparse.lib)
......@@ -447,15 +502,37 @@ if(MGE_WITH_CUDA)
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER "10.1.0" OR ${CMAKE_CUDA_COMPILER_VERSION} VERSION_EQUAL "10.1.0")
list(APPEND MGE_CUDA_LIBS cublasLt cusolver cublas curand)
endif()
list(APPEND MGE_CUDA_LIBS cudart)
endif()
if(NOT MGE_WITH_CUDA_STUB)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS cuda.lib)
else()
list(APPEND MGE_CUDA_LIBS cuda)
endif()
endif()
if(NOT MGE_WITH_NVRTC_STUB)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS nvrtc.lib)
else()
list(APPEND MGE_CUDA_LIBS nvrtc)
endif()
endif()
if(MGE_WITH_ANY_CUDA_STUB)
add_subdirectory(dnn/cuda-stub)
list(APPEND MGE_CUDA_LIBS cuda-stub)
endif()
add_subdirectory(dnn/cuda-stub)
if(MSVC OR WIN32)
list(APPEND MGE_CUDA_LIBS nvrtc.lib cuda-stub)
list(APPEND MGE_CUDA_LIBS nvrtc.lib)
else()
list(APPEND MGE_CUDA_LIBS nvrtc cuda-stub nvToolsExt)
list(APPEND MGE_CUDA_LIBS nvToolsExt)
endif()
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS}")
set(MGE_CUDA_LIBS "${MGE_CUDA_LIBS} -lrt")
endif()
if(MGE_WITH_CAMBRICON)
......@@ -800,6 +877,9 @@ if(TARGET _imperative_rt)
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}>
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/core/$<TARGET_FILE_NAME:${MODULE_NAME}>
COMMAND ${CMAKE_COMMAND} -E create_symlink
${CMAKE_CURRENT_BINARY_DIR}/imperative/python/${PACKAGE_NAME}/version.py
${CMAKE_CURRENT_SOURCE_DIR}/imperative/python/${PACKAGE_NAME}/version.py
DEPENDS _imperative_rt
VERBATIM
)
......@@ -863,3 +943,9 @@ if(MGE_WITH_JIT_MLIR)
add_subdirectory(tools/mlir/mgb-opt)
add_subdirectory(tools/mlir/mgb-file-check)
endif()
if(MGE_WITH_CUDA AND MGE_CUDA_USE_STATIC AND("${CUDNN_VERSION}" VERSION_GREATER "8.0.0" OR "${CUDNN_VERSION}" VERSION_EQUAL "8.0.0") AND (NOT MGE_WITH_CUDNN_SHARED))
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
message(WARNING "Static link CUDNN8 with many sm is unworkable, please use -DMGE_WITH_CUDNN_SHARED=ON or -DMGE_WITH_LARGE_ARCHIVE=ON -DMGE_CUDA_GENCODE=\"-gencode arch=compute_70,code=sm_70 arch=compute_75,code=sm_75\" ")
endif()
\ No newline at end of file
......@@ -11,7 +11,7 @@ if("${CUDNN_ROOT_DIR}" STREQUAL "" AND NOT "$ENV{CUDNN_ROOT_DIR}" STREQUAL "")
set(CUDNN_ROOT_DIR $ENV{CUDNN_ROOT_DIR})
endif()
if(MGE_CUDA_USE_STATIC)
if(MGE_CUDA_USE_STATIC AND NOT MGE_WITH_CUDNN_SHARED)
find_library(CUDNN_LIBRARY
NAMES libcudnn_static.a cudnn.lib
PATHS $ENV{LD_LIBRARY_PATH} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX}
......@@ -42,7 +42,12 @@ if(CUDNN_INCLUDE_DIR STREQUAL "CUDNN_INCLUDE_DIR-NOTFOUND")
message(FATAL_ERROR "Can not find CuDNN Library")
endif()
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
if(EXISTS ${CUDNN_INCLUDE_DIR}/cudnn_version.h)
file(READ ${CUDNN_INCLUDE_DIR}/cudnn_version.h CUDNN_VERSION_FILE_CONTENTS)
else()
file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS)
endif()
string(REGEX MATCH "define CUDNN_MAJOR * +([0-9]+)"
CUDNN_MAJOR_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_MAJOR * +([0-9]+)" "\\1"
......@@ -55,7 +60,9 @@ string(REGEX MATCH "define CUDNN_PATCHLEVEL * +([0-9]+)"
CUDNN_PATCH_VERSION "${CUDNN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define CUDNN_PATCHLEVEL * +([0-9]+)" "\\1"
CUDNN_PATCH_VERSION "${CUDNN_PATCH_VERSION}")
set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION})
set(CUDNN_VERSION ${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCH_VERSION})
if(MGE_CUDA_USE_STATIC)
add_library(libcudnn STATIC IMPORTED)
......
file (GLOB_RECURSE SOURCES src/*.cpp)
file (GLOB_RECURSE CUDA_STUB src/libcuda.cpp)
file (GLOB_RECURSE NVRTC_STUB src/libnvrtc.cpp)
if(MGE_WITH_CUDA_STUB)
list(APPEND STUB_SRC ${CUDA_STUB})
endif()
if(MGE_WITH_NVRTC_STUB)
list(APPEND STUB_SRC ${NVRTC_STUB})
endif()
if(MSVC OR WIN32)
add_library (cuda-stub STATIC ${SOURCES})
add_library (cuda-stub STATIC ${STUB_SRC})
else()
add_library (cuda-stub SHARED ${SOURCES})
add_library (cuda-stub SHARED ${STUB_SRC})
endif()
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda)
set_target_properties(cuda-stub PROPERTIES OUTPUT_NAME cuda_stub)
target_compile_definitions(cuda-stub PRIVATE __CUDA_API_VERSION_INTERNAL)
if (MSVC OR WIN32)
target_link_libraries(cuda-stub PRIVATE -Wl,--no-undefined)
......
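// Shared stub helper (included below as "./dlopen_helper.h"): a portable
// dlopen/dlsym shim for Windows plus the library-search logic common to the
// CUDA and NVRTC stubs.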
#if defined(_WIN32)
#include <windows.h>
#define RTLD_LAZY 0
static void* dlopen(const char* file, int) {
return static_cast<void*>(LoadLibraryA(file));
}
static void* dlerror() {
const char* errmsg = "dlerror not available in windows";
return const_cast<char*>(errmsg);
}
static void* dlsym(void* handle, const char* name) {
FARPROC symbol = GetProcAddress((HMODULE)handle, name);
return reinterpret_cast<void*>(symbol);
}
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
#include <sstream>
#include <string>
#include <vector>
static std::vector<std::string> split_string(const std::string& s, char delim) {
std::vector<std::string> elems;
std::stringstream ss(s);
std::string item;
while (std::getline(ss, item, delim)) {
elems.push_back(item);
}
return elems;
}
static std::vector<std::string> get_env_dir(const char* env_name) {
const char* env_p = std::getenv(env_name);
std::vector<std::string> env_dir;
if (env_p) {
env_dir = split_string(env_p, ':');
}
return env_dir;
}
static void* try_open_handle(std::vector<std::string> dir_vec,
std::string default_so_name) {
void* handle = nullptr;
for (auto& tk_path : dir_vec) {
handle = dlopen((tk_path + "/" + default_so_name).c_str(), RTLD_LAZY);
if (handle) {
break;
}
}
return handle;
}
static void* try_open_handle(const char** so_vec, int nr_so) {
void* handle = nullptr;
for (int i = 0; i < nr_so; ++i) {
handle = dlopen(so_vec[i], RTLD_LAZY);
if (handle) {
break;
}
}
return handle;
}
static void* get_library_handle() {
std::vector<std::string> cuda_tk_dir = get_env_dir("CUDA_TK_PATH");
std::vector<std::string> ld_dir = get_env_dir("LD_LIBRARY_PATH");
void* handle = nullptr;
if (!handle) {
handle = try_open_handle(ld_dir, default_so_name);
}
if (!handle) {
handle = try_open_handle(cuda_tk_dir, default_so_name);
}
if (!handle) {
handle = try_open_handle(default_so_paths,
sizeof(default_so_paths) / sizeof(char*));
}
if (!handle) {
handle = try_open_handle(extra_so_paths,
sizeof(extra_so_paths) / sizeof(char*));
}
if (!handle) {
LOGE("Failed to load %s API library", g_default_api_name);
return nullptr;
}
return handle;
}
static void log_failed_load(int func_idx) {
LOGE("failed to load %s func: %s", g_default_api_name,
g_func_name[func_idx]);
}
static void* resolve_library_func(void* handle, const char* func) {
if (!handle) {
LOGE("%s handle should not be nullptr!", g_default_api_name);
return nullptr;
}
auto ret = dlsym(handle, func);
if (!ret) {
LOGE("failed to load %s func: %s", g_default_api_name, func);
}
return ret;
}
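The helpers above implement the stub's search order: directories from an environment variable (split on `:`) are tried first, then the default SONAME via the normal loader search path, then hard-coded fallback paths. Below is a minimal standalone sketch of that order, assuming a Linux host and using `libm.so.6` as a stand-in target so it runs without a CUDA driver (link with `-ldl`):

```cpp
// Sketch of the search order in get_library_handle(); libm.so.6 stands in
// for the real default_so_name so the demo works without a GPU driver.
#include <dlfcn.h>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> env_dirs(const char* name) {
    std::vector<std::string> out;
    if (const char* v = std::getenv(name)) {
        std::stringstream ss(v);
        for (std::string item; std::getline(ss, item, ':');)
            out.push_back(item);
    }
    return out;
}

int main() {
    const char* so_name = "libm.so.6";  // stand-in for libcuda.so.1
    void* handle = nullptr;
    // 1. directories listed in an env var, mirroring get_env_dir()
    for (auto& dir : env_dirs("LD_LIBRARY_PATH")) {
        handle = dlopen((dir + "/" + so_name).c_str(), RTLD_LAZY);
        if (handle) break;
    }
    // 2. plain SONAME, letting the dynamic loader search its default paths
    if (!handle)
        handle = dlopen(so_name, RTLD_LAZY);
    std::printf("handle: %p\n", handle);
    return handle ? 0 : 1;
}
```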
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -3,36 +3,14 @@
#include <cstdio>
#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v)
extern "C" {
#include <cuda.h>
#include "cuda.h"
}
#include <cudaProfiler.h>
#include "cudaProfiler.h"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#if defined(_WIN32)
#include <windows.h>
#define RTLD_LAZY 0
static void* dlopen(const char* file, int) {
return static_cast<void*>(LoadLibraryA(file));
}
static void* dlerror() {
const char* errmsg = "dlerror not available in windows";
return const_cast<char*>(errmsg);
}
static void* dlsym(void* handle, const char* name) {
FARPROC symbol = GetProcAddress((HMODULE)handle, name);
return reinterpret_cast<void*>(symbol);
}
#else
#include <dlfcn.h>
#include <unistd.h>
#endif
static void log_failed_load(int func_idx);
namespace {
template <typename T>
......@@ -42,68 +20,63 @@ CUresult on_init_failed(int func_idx) {
log_failed_load(func_idx);
return CUDA_ERROR_UNKNOWN;
}
}
} // namespace
#define _WRAPLIB_API_CALL CUDAAPI
#define _WRAPLIB_CALLBACK CUDA_CB
#include "./libcuda-wrap.h"
#if CUDA_VERSION == 10010
#include "./libcuda-wrap_10.1.h"
#elif CUDA_VERSION == 10020
#include "./libcuda-wrap_10.2.h"
#elif CUDA_VERSION == 11010
#include "./libcuda-wrap_11.1.h"
#elif CUDA_VERSION == 11020
#include "./libcuda-wrap_11.2.h"
#else
#error "cuda stub not support this cuda version, you can close cuda stub to passby"
#endif
#undef _WRAPLIB_CALLBACK
#undef _WRAPLIB_API_CALL
static const char* default_so_name =
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
"nvcuda.dll";
#elif defined(__APPLE__) || defined(__MACOSX)
"libcuda.dylib";
#else
"libcuda.so.1";
#endif
// Harvested from cuda_drvapi_dynlink.c
static const char* default_so_paths[] = {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
"nvcuda.dll",
#elif defined(__unix__) || defined (__QNX__) || defined(__APPLE__) || defined(__MACOSX)
"nvcuda.dll",
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \
defined(__MACOSX)
#if defined(__APPLE__) || defined(__MACOSX)
"/usr/local/cuda/lib/libcuda.dylib",
"/usr/local/cuda/lib/libcuda.dylib",
#elif defined(__ANDROID__)
#if defined (__aarch64__)
"/system/vendor/lib64/libcuda.so",
#if defined(__aarch64__)
"/system/vendor/lib64/libcuda.so",
#elif defined(__arm__)
"/system/vendor/lib/libcuda.so",
"/system/vendor/lib/libcuda.so",
#endif
#else
"libcuda.so.1",
    // In case some users do not have the correct search path configured in
    // /etc/ld.so.conf
"/usr/lib/x86_64-linux-gnu/libcuda.so",
"/usr/local/nvidia/lib64/libcuda.so",
"libcuda.so.1",
#endif
#else
#error "Unknown platform"
#endif
};
static void* get_library_handle() {
void* handle = nullptr;
for (size_t i = 0; i < (sizeof(default_so_paths) / sizeof(char*)); i++) {
handle = dlopen(default_so_paths[i], RTLD_LAZY);
if (handle) {
break;
}
}
if (!handle) {
LOGE("Failed to load CUDA Driver API library");
return nullptr;
}
return handle;
}
static void log_failed_load(int func_idx) {
LOGE("failed to load cuda func: %s", g_func_name[func_idx]);
}
static const char* extra_so_paths[] = {
"/usr/lib/x86_64-linux-gnu/libcuda.so",
"/usr/local/nvidia/lib64/libcuda.so",
};
static void* resolve_library_func(void* handle, const char* func) {
if (!handle) {
LOGE("handle should not be nullptr!");
return nullptr;
}
auto ret = dlsym(handle, func);
if (!ret) {
LOGE("failed to load cuda func: %s", func);
}
return ret;
}
static const char* g_default_api_name = "cuda";
#include "./dlopen_helper.h"
\ No newline at end of file
// generated by wraplib.py
// --- begin functions to be implemented
#ifndef _WRAPLIB_API_CALL
#define _WRAPLIB_API_CALL
#endif
#ifndef _WRAPLIB_CALLBACK
#define _WRAPLIB_CALLBACK
#endif
#ifndef ON_ENTRY
#define ON_ENTRY(x)
#endif
static void* get_library_handle();
static void* resolve_library_func(void*, const char*);
namespace {
template <typename T>
T on_init_failed(int func_idx);
}
// --- end functions to be implemented
#include <cstddef>
#include <mutex>
extern "C" {
const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0,
const char* arg1,
const char* arg2, int arg3,
const char* const* arg4,
const char* const* arg5);
nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0);
nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1,
const char* const* arg2);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0,
size_t* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0,
size_t* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0, char* arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0,
const char* const arg1);
nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0,
const char* const arg1,
const char** arg2);
}
static void load_library();
static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_init(
nvrtcResult arg0) {
load_library();
return nvrtcGetErrorString(arg0);
}
static const char _WRAPLIB_API_CALL* nvrtcGetErrorString_error(nvrtcResult) {
return on_init_failed<const char*>(0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_init(int* arg0, int* arg1) {
load_library();
return nvrtcVersion(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcVersion_error(int*, int*) {
return on_init_failed<nvrtcResult>(1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_init(int* arg0) {
load_library();
return nvrtcGetNumSupportedArchs(arg0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs_error(int*) {
return on_init_failed<nvrtcResult>(2);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_init(int* arg0) {
load_library();
return nvrtcGetSupportedArchs(arg0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs_error(int*) {
return on_init_failed<nvrtcResult>(3);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram_init(
nvrtcProgram* arg0, const char* arg1, const char* arg2, int arg3,
const char* const* arg4, const char* const* arg5) {
load_library();
return nvrtcCreateProgram(arg0, arg1, arg2, arg3, arg4, arg5);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcCreateProgram_error(nvrtcProgram*, const char*, const char*, int,
const char* const*, const char* const*) {
return on_init_failed<nvrtcResult>(4);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcDestroyProgram_init(nvrtcProgram* arg0) {
load_library();
return nvrtcDestroyProgram(arg0);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram_error(nvrtcProgram*) {
return on_init_failed<nvrtcResult>(5);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcCompileProgram_init(nvrtcProgram arg0, int arg1, const char* const* arg2) {
load_library();
return nvrtcCompileProgram(arg0, arg1, arg2);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcCompileProgram_error(nvrtcProgram, int, const char* const*) {
return on_init_failed<nvrtcResult>(6);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_init(nvrtcProgram arg0,
size_t* arg1) {
load_library();
return nvrtcGetPTXSize(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize_error(nvrtcProgram,
size_t*) {
return on_init_failed<nvrtcResult>(7);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_init(nvrtcProgram arg0,
char* arg1) {
load_library();
return nvrtcGetPTX(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX_error(nvrtcProgram, char*) {
return on_init_failed<nvrtcResult>(8);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_init(nvrtcProgram arg0,
size_t* arg1) {
load_library();
return nvrtcGetCUBINSize(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize_error(nvrtcProgram,
size_t*) {
return on_init_failed<nvrtcResult>(9);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_init(nvrtcProgram arg0,
char* arg1) {
load_library();
return nvrtcGetCUBIN(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN_error(nvrtcProgram, char*) {
return on_init_failed<nvrtcResult>(10);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcGetProgramLogSize_init(nvrtcProgram arg0, size_t* arg1) {
load_library();
return nvrtcGetProgramLogSize(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize_error(nvrtcProgram,
size_t*) {
return on_init_failed<nvrtcResult>(11);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_init(nvrtcProgram arg0,
char* arg1) {
load_library();
return nvrtcGetProgramLog(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog_error(nvrtcProgram,
char*) {
return on_init_failed<nvrtcResult>(12);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcAddNameExpression_init(nvrtcProgram arg0, const char* const arg1) {
load_library();
return nvrtcAddNameExpression(arg0, arg1);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcAddNameExpression_error(nvrtcProgram, const char* const) {
return on_init_failed<nvrtcResult>(13);
}
static nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName_init(
nvrtcProgram arg0, const char* const arg1, const char** arg2) {
load_library();
return nvrtcGetLoweredName(arg0, arg1, arg2);
}
static nvrtcResult _WRAPLIB_API_CALL
nvrtcGetLoweredName_error(nvrtcProgram, const char* const, const char**) {
return on_init_failed<nvrtcResult>(14);
}
static constexpr size_t NR_FUNC = 15;
static void* g_func_table[NR_FUNC] = {(void*)(&nvrtcGetErrorString_init),
(void*)(&nvrtcVersion_init),
(void*)(&nvrtcGetNumSupportedArchs_init),
(void*)(&nvrtcGetSupportedArchs_init),
(void*)(&nvrtcCreateProgram_init),
(void*)(&nvrtcDestroyProgram_init),
(void*)(&nvrtcCompileProgram_init),
(void*)(&nvrtcGetPTXSize_init),
(void*)(&nvrtcGetPTX_init),
(void*)(&nvrtcGetCUBINSize_init),
(void*)(&nvrtcGetCUBIN_init),
(void*)(&nvrtcGetProgramLogSize_init),
(void*)(&nvrtcGetProgramLog_init),
(void*)(&nvrtcAddNameExpression_init),
(void*)(&nvrtcGetLoweredName_init)};
static void* g_func_table_error[NR_FUNC] = {
(void*)(&nvrtcGetErrorString_error),
(void*)(&nvrtcVersion_error),
(void*)(&nvrtcGetNumSupportedArchs_error),
(void*)(&nvrtcGetSupportedArchs_error),
(void*)(&nvrtcCreateProgram_error),
(void*)(&nvrtcDestroyProgram_error),
(void*)(&nvrtcCompileProgram_error),
(void*)(&nvrtcGetPTXSize_error),
(void*)(&nvrtcGetPTX_error),
(void*)(&nvrtcGetCUBINSize_error),
(void*)(&nvrtcGetCUBIN_error),
(void*)(&nvrtcGetProgramLogSize_error),
(void*)(&nvrtcGetProgramLog_error),
(void*)(&nvrtcAddNameExpression_error),
(void*)(&nvrtcGetLoweredName_error)};
static const char* const g_func_name[NR_FUNC] = {"nvrtcGetErrorString",
"nvrtcVersion",
"nvrtcGetNumSupportedArchs",
"nvrtcGetSupportedArchs",
"nvrtcCreateProgram",
"nvrtcDestroyProgram",
"nvrtcCompileProgram",
"nvrtcGetPTXSize",
"nvrtcGetPTX",
"nvrtcGetCUBINSize",
"nvrtcGetCUBIN",
"nvrtcGetProgramLogSize",
"nvrtcGetProgramLog",
"nvrtcAddNameExpression",
"nvrtcGetLoweredName"};
static void load_library() {
static bool done = false;
static std::mutex mtx;
std::lock_guard<std::mutex> lg{mtx};
if (done)
return;
void* handle = get_library_handle();
for (size_t i = 0; i < NR_FUNC; ++i) {
void* func;
if (!handle) {
func = nullptr;
} else {
func = resolve_library_func(handle, g_func_name[i]);
}
if (!func) {
func = g_func_table_error[i];
}
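        // Publish the chosen pointer with an atomic store so concurrent
        // first calls never observe a torn write; callers read the table
        // with plain loads.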
__atomic_store_n(g_func_table + i, func, __ATOMIC_RELAXED);
}
done = true;
}
const char _WRAPLIB_API_CALL* nvrtcGetErrorString(nvrtcResult arg0) {
typedef const char*(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcResult);
ON_ENTRY(nvrtcGetErrorString);
f_ptr_t f = (f_ptr_t)(g_func_table[0]);
return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcVersion(int* arg0, int* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*, int*);
ON_ENTRY(nvrtcVersion);
f_ptr_t f = (f_ptr_t)(g_func_table[1]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetNumSupportedArchs(int* arg0) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*);
ON_ENTRY(nvrtcGetNumSupportedArchs);
f_ptr_t f = (f_ptr_t)(g_func_table[2]);
return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetSupportedArchs(int* arg0) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(int*);
ON_ENTRY(nvrtcGetSupportedArchs);
f_ptr_t f = (f_ptr_t)(g_func_table[3]);
return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcCreateProgram(nvrtcProgram* arg0,
const char* arg1,
const char* arg2, int arg3,
const char* const* arg4,
const char* const* arg5) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(
nvrtcProgram*, const char*, const char*, int, const char* const*,
const char* const*);
ON_ENTRY(nvrtcCreateProgram);
f_ptr_t f = (f_ptr_t)(g_func_table[4]);
return f(arg0, arg1, arg2, arg3, arg4, arg5);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcDestroyProgram(nvrtcProgram* arg0) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram*);
ON_ENTRY(nvrtcDestroyProgram);
f_ptr_t f = (f_ptr_t)(g_func_table[5]);
return f(arg0);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcCompileProgram(nvrtcProgram arg0, int arg1,
const char* const* arg2) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, int,
const char* const*);
ON_ENTRY(nvrtcCompileProgram);
f_ptr_t f = (f_ptr_t)(g_func_table[6]);
return f(arg0, arg1, arg2);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTXSize(nvrtcProgram arg0, size_t* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*);
ON_ENTRY(nvrtcGetPTXSize);
f_ptr_t f = (f_ptr_t)(g_func_table[7]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetPTX(nvrtcProgram arg0, char* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*);
ON_ENTRY(nvrtcGetPTX);
f_ptr_t f = (f_ptr_t)(g_func_table[8]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBINSize(nvrtcProgram arg0,
size_t* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*);
ON_ENTRY(nvrtcGetCUBINSize);
f_ptr_t f = (f_ptr_t)(g_func_table[9]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetCUBIN(nvrtcProgram arg0, char* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*);
ON_ENTRY(nvrtcGetCUBIN);
f_ptr_t f = (f_ptr_t)(g_func_table[10]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLogSize(nvrtcProgram arg0,
size_t* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, size_t*);
ON_ENTRY(nvrtcGetProgramLogSize);
f_ptr_t f = (f_ptr_t)(g_func_table[11]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetProgramLog(nvrtcProgram arg0,
char* arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram, char*);
ON_ENTRY(nvrtcGetProgramLog);
f_ptr_t f = (f_ptr_t)(g_func_table[12]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcAddNameExpression(nvrtcProgram arg0,
const char* const arg1) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(nvrtcProgram,
const char* const);
ON_ENTRY(nvrtcAddNameExpression);
f_ptr_t f = (f_ptr_t)(g_func_table[13]);
return f(arg0, arg1);
}
nvrtcResult _WRAPLIB_API_CALL nvrtcGetLoweredName(nvrtcProgram arg0,
const char* const arg1,
const char** arg2) {
typedef nvrtcResult(_WRAPLIB_API_CALL * f_ptr_t)(
nvrtcProgram, const char* const, const char**);
ON_ENTRY(nvrtcGetLoweredName);
f_ptr_t f = (f_ptr_t)(g_func_table[14]);
return f(arg0, arg1, arg2);
}
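The generated wrapper follows a uniform lazy-binding pattern: every slot in `g_func_table` starts as an `*_init` trampoline that triggers `load_library()` and re-dispatches, and slots whose symbols cannot be resolved are swapped to `*_error` stubs. A standalone sketch of the pattern with a single hypothetical function `foo` (not part of the real generated header):

```cpp
// Minimal model of the stub trampoline: the first call resolves the symbol
// and rewrites the table; later calls go straight through the resolved pointer.
#include <cstdio>
#include <mutex>

typedef int (*foo_fn)(int);

static int foo_impl(int x) { return x + 1; }   // stands in for the dlsym() result
static int foo_init(int x);                    // trampoline, defined below
static void* g_table[1] = {(void*)&foo_init};

static void load_library_once() {
    static std::once_flag flag;
    std::call_once(flag, [] {
        // The real code does dlopen() + dlsym() here and installs an error
        // stub on failure; this sketch just fakes a successful resolve.
        g_table[0] = (void*)&foo_impl;
    });
}

static int foo_init(int x) {
    load_library_once();
    return ((foo_fn)g_table[0])(x);
}

// Public entry point, analogous to the extern "C" wrappers above.
int foo(int x) { return ((foo_fn)g_table[0])(x); }

int main() { std::printf("%d\n", foo(41)); }   // prints 42
```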
/**
* \file dnn/cuda-stub/src/libnvrtc.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma GCC visibility push(default)
#include <cstdio>
#define LOGE(fmt, v...) fprintf(stderr, "err: " fmt "\n", ##v)
#include "./nvrtc_type.h"
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
static void log_failed_load(int func_idx);
namespace {
template <typename T>
T on_init_failed(int func_idx);
template <>
nvrtcResult on_init_failed(int func_idx) {
log_failed_load(func_idx);
return NVRTC_ERROR_INTERNAL_ERROR;
}
template <>
const char* on_init_failed(int func_idx) {
log_failed_load(func_idx);
return "load lib failed";
}
} // namespace
#include "./libnvrtc-wrap.h"
static const char* default_so_name =
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
"nvrtc.dll";
#elif defined(__APPLE__) || defined(__MACOSX)
"libnvrtc.dylib";
#else
"libnvrtc.so";
#endif
static const char* default_so_paths[] = {
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
"nvrtc.dll",
#elif defined(__unix__) || defined(__QNX__) || defined(__APPLE__) || \
defined(__MACOSX)
#if defined(__APPLE__) || defined(__MACOSX)
"/usr/local/cuda/lib/libnvrtc.dylib",
#elif defined(__ANDROID__)
#if defined(__aarch64__)
"/system/vendor/lib64/libnvrtc.so",
#elif defined(__arm__)
"/system/vendor/lib/libnvrtc.so",
#endif
#else
"libnvrtc.so",
    // In case some users do not have the correct search path configured in
    // /etc/ld.so.conf
"/usr/lib/x86_64-linux-gnu/libnvrtc.so",
"/usr/local/nvidia/lib64/libnvrtc.so",
"/usr/local/cuda/lib64/libnvrtc.so",
#endif
#else
#error "Unknown platform"
#endif
};
static const char* extra_so_paths[] = {};
static const char* g_default_api_name = "nvrtc";
#include "./dlopen_helper.h"
\ No newline at end of file
#pragma once
#ifdef __cplusplus
extern "C" {
#endif /* __cplusplus */
#include <stdlib.h>
typedef enum {
NVRTC_SUCCESS = 0,
NVRTC_ERROR_OUT_OF_MEMORY = 1,
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
NVRTC_ERROR_INVALID_INPUT = 3,
NVRTC_ERROR_INVALID_PROGRAM = 4,
NVRTC_ERROR_INVALID_OPTION = 5,
NVRTC_ERROR_COMPILATION = 6,
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
NVRTC_ERROR_INTERNAL_ERROR = 11
} nvrtcResult;
typedef struct _nvrtcProgram *nvrtcProgram;
#ifdef __cplusplus
}
#endif /* __cplusplus */
......@@ -26,6 +26,7 @@ static inline CUBLASLTMatmulDesc::SizeArgs from_local_size_args(
return {handle, transA, transB,
args.layout_a, args.layout_b, args.layout_c};
}
bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available(
const SizeArgs& args) const {
auto cublasLt_args = from_local_size_args(args);
......@@ -35,6 +36,7 @@ bool BatchedMatrixMulForwardImpl::AlgoCublasLt::is_available(
.is_available(cublasLt_args, INT_MAX);
return res;
}
size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes(
const SizeArgs& args) const {
auto cublasLt_args = from_local_size_args(args);
......@@ -43,6 +45,7 @@ size_t BatchedMatrixMulForwardImpl::AlgoCublasLt::get_workspace_in_bytes(
desc.get_algorithm_heuristic(cublasLt_args, INT_MAX, algo);
return desc.get_workspace_bundle(cublasLt_args, algo).total_size_in_bytes();
}
void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
const ExecArgs& args) const {
auto cublasLt_args = from_local_size_args(args);
......@@ -89,6 +92,7 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
desc.layout_c, &algo, ws_bundle.get(0),
ws_bundle.get_size(0), stream));
};
auto batched_igemm = [&]() {
auto zero = handle->zero_device();
auto one = handle->one_device();
......@@ -133,6 +137,18 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
};
ws_bundle.set(args.workspace.raw_ptr);
#if CUDA_VERSION >= 11000
if (desc.dt_compute == CUBLAS_COMPUTE_32I) {
batched_igemm();
} else if (desc.dt_compute == CUBLAS_COMPUTE_16F) {
batched_hgemm();
} else if (desc.dt_compute == CUBLAS_COMPUTE_32F) {
batched_sgemm();
} else {
megdnn_throw(
megdnn_mangle("compute_type must be int32/float16/float32"));
}
#else
if (desc.dt_compute == CUDA_R_32I) {
batched_igemm();
} else if (desc.dt_compute == CUDA_R_16F) {
......@@ -143,5 +159,6 @@ void BatchedMatrixMulForwardImpl::AlgoCublasLt::exec(
megdnn_throw(
megdnn_mangle("compute_type must be int32/float16/float32"));
}
#endif
}
#endif
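The pattern used here, and again in the cublasLt wrapper below, is compile-time gating on `CUDA_VERSION`: CUDA 11+ expresses the compute type as `cublasComputeType_t` (`CUBLAS_COMPUTE_*`), while older toolkits reuse `cudaDataType_t` (`CUDA_R_*`). A self-contained sketch of the gating, with `FAKE_CUDA_VERSION` standing in for the real macro so it builds without CUDA headers:

```cpp
// FAKE_CUDA_VERSION stands in for CUDA_VERSION from cuda.h; change it to
// 10020 to compile the pre-11 branch instead.
#include <cstdio>

#define FAKE_CUDA_VERSION 11010

#if FAKE_CUDA_VERSION >= 11000
enum ComputeType { COMPUTE_16F, COMPUTE_32F, COMPUTE_32I };  // cublasComputeType_t stand-in
#else
enum ComputeType { R_16F, R_32F, R_32I };                    // cudaDataType_t stand-in
#endif

static const char* kernel_for(ComputeType t) {
#if FAKE_CUDA_VERSION >= 11000
    switch (t) {
        case COMPUTE_16F: return "hgemm";
        case COMPUTE_32F: return "sgemm";
        case COMPUTE_32I: return "igemm";
    }
#else
    switch (t) {
        case R_16F: return "hgemm";
        case R_32F: return "sgemm";
        case R_32I: return "igemm";
    }
#endif
    return "unsupported";
}

int main() { std::printf("%s\n", kernel_for(ComputeType(1))); }  // prints "sgemm"
```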
......@@ -156,6 +156,9 @@ std::string ConvBiasForwardImpl::AlgoBase::SizeArgs::to_string() const {
case param::ConvBias::NonlineMode::IDENTITY:
nonlinear_mode_str = "IDENTITY";
break;
case param::ConvBias::NonlineMode::H_SWISH:
nonlinear_mode_str = "H_SWISH";
break;
default:
megdnn_throw("invalid conv bias nonlinear mode");
}
......
......@@ -165,6 +165,23 @@ void TensorDesc::set(const TensorLayout& layout,
}
}
std::string TensorDesc::to_string() {
cudnnDataType_t data_type;
int n;
int c;
int h;
int w;
int n_stride;
int c_stride;
int h_stride;
int w_stride;
cudnn_check(cudnnGetTensor4dDescriptor(desc, &data_type, &n, &c, &h, &w,
&n_stride, &c_stride, &h_stride,
&w_stride));
return ssprintf("<dtype_%d, %d,%d,%d,%d(%d,%d,%d,%d)>", data_type, n, c, h,
w, n_stride, c_stride, h_stride, w_stride);
}
template <typename Param>
FilterDesc<Param>::FilterDesc() {
cudnn_check(cudnnCreateFilterDescriptor(&desc));
......@@ -175,6 +192,20 @@ FilterDesc<Param>::~FilterDesc() {
cudnn_check(cudnnDestroyFilterDescriptor(desc));
}
template <typename Param>
std::string FilterDesc<Param>::to_string() {
cudnnDataType_t data_type;
cudnnTensorFormat_t format;
int k;
int c;
int h;
int w;
cudnn_check(cudnnGetFilter4dDescriptor(desc, &data_type, &format, &k, &c,
&h, &w));
return ssprintf("<dtype_%d, format_%d, %d,%d,%d,%d>", data_type,format, k, c, h,
w);
}
template <typename Param>
void FilterDesc<Param>::set(
const typename ConvolutionBase<Param>::CanonizedFilterMeta&
......
......@@ -30,6 +30,7 @@ class TensorDesc {
//! default layout is nchw
void set(const TensorLayout& layout, const param::Convolution::Format =
param::Convolution::Format::NCHW);
std::string to_string();
~TensorDesc();
cudnnTensorDescriptor_t desc;
};
......@@ -39,6 +40,7 @@ class FilterDesc {
public:
FilterDesc();
void set(const typename ConvolutionBase<Param>::CanonizedFilterMeta &meta);
std::string to_string();
~FilterDesc();
cudnnFilterDescriptor_t desc;
};
......
......@@ -25,6 +25,10 @@ using namespace cuda;
#define SE_CUDA_DATA_HALF CUBLAS_DATA_HALF
#endif
#if CUDA_VERSION < 11000
#define CUBLAS_COMPUTE_32I CUDA_R_32I
#endif
bool MatrixMulForwardImpl::AlgoCuBlas::is_available(
const SizeArgs& args) const {
if (args.opr->param().format != param::MatrixMul::Format::DEFAULT)
......@@ -117,7 +121,7 @@ void MatrixMulForwardImpl::AlgoCuBlas::exec(const ExecArgs& args) const {
args.tensor_b.layout.stride[0], args.tensor_a.raw_ptr,
CUDA_R_8I, args.tensor_a.layout.stride[0], zero,
args.tensor_c.raw_ptr, CUDA_R_32I,
args.tensor_c.layout.stride[0], CUDA_R_32I, CUBLAS_GEMM_DFALT));
args.tensor_c.layout.stride[0], CUBLAS_COMPUTE_32I, CUBLAS_GEMM_DFALT));
};
// Note that cublas takes column-major matrices as inputs,
......
......@@ -6,10 +6,11 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/matrix_mul/cublasLt_wrapper.h"
#include "src/common/utils.h"
#include "src/cuda/matrix_mul/cublasLt_wrapper.h"
#include "src/cuda/utils.h"
#if CUDA_VERSION >= 10010
......@@ -33,6 +34,7 @@ static cudaDataType_t to_cuda_dtype(DType tp) {
}
}
#if CUDA_VERSION >= 11000
static cublasComputeType_t to_cublas_compute_type(DType tp) {
switch (tp.enumv()) {
case DTypeEnum::Float16:
......@@ -43,10 +45,11 @@ static cublasComputeType_t to_cublas_compute_type(DType tp) {
case DTypeEnum::QuantizedS32:
return CUBLAS_COMPUTE_32I;
default:
megdnn_throw(megdnn_mangle(
"dtype must be float16/float32/int32/Qs32"));
megdnn_throw(
megdnn_mangle("dtype must be float16/float32/int32/Qs32"));
}
}
#endif
static const char* cuda_type_to_str(cudaDataType_t tp) {
switch (tp) {
......@@ -106,9 +109,15 @@ void CUBLASLTMatmulDesc::set(const SizeArgs& args, bool batched) {
dt_b = to_cuda_dtype(args.layout_b.dtype);
dt_a = to_cuda_dtype(args.layout_a.dtype);
dt_c = to_cuda_dtype(args.layout_c.dtype);
dt_compute = to_cublas_compute_type(args.layout_c.dtype);
megdnn_assert(dt_a == dt_b, "matrix A and B should have same precision");
#if CUDA_VERSION >= 11000
dt_compute = to_cublas_compute_type(args.layout_c.dtype);
cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute, dt_c));
#else
dt_compute = dt_c;
cublas_check(cublasLtMatmulDescCreate(&matmul_desc, dt_compute));
#endif
cublas_check(cublasLtMatmulDescSetAttribute(
matmul_desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &pm, sizeof(pm)));
......@@ -262,8 +271,7 @@ WorkspaceBundle CUBLASLTMatmulDesc::get_workspace_bundle(
dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
dt_c == CUDA_R_32I ? layout_trans_a : layout_a,
dt_c == CUDA_R_32I ? layout_trans_c : layout_c,
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo,
&result);
dt_c == CUDA_R_32I ? layout_trans_c : layout_c, &algo, &result);
// return empty WorkspaceBundle if cublasLtMatmulAlgoCheck() failed
if (status != CUBLAS_STATUS_SUCCESS)
return {nullptr, {}};
......
......@@ -48,7 +48,11 @@ struct CUBLASLTMatmulDesc {
bool is_batched;
cublasLtMatmulDesc_t matmul_desc;
cudaDataType_t dt_a, dt_b, dt_c;
#if CUDA_VERSION >= 11000
cublasComputeType_t dt_compute;
#else
cudaDataType_t dt_compute;
#endif
cublasLtMatrixLayout_t layout_a, layout_b, layout_c;
cublasLtMatrixLayout_t layout_trans_a, layout_trans_b, layout_trans_c;
size_t workspace_a, workspace_b, workspace_c;
......
......@@ -128,7 +128,23 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const {
stream));
cublas_check(cublasLtMatrixTransformDescDestroy(transform_desc));
};
switch(desc.dt_compute) {
#if CUDA_VERSION >= 11000
switch (desc.dt_compute) {
case CUBLAS_COMPUTE_16F:
hgemm();
break;
case CUBLAS_COMPUTE_32F:
sgemm();
break;
case CUBLAS_COMPUTE_32I:
igemm();
break;
default:
megdnn_throw(megdnn_mangle(
"compute type must be float16/float32/int32"));
}
#else
switch (desc.dt_compute) {
case CUDA_R_16F:
hgemm();
break;
......@@ -139,8 +155,10 @@ void MatrixMulForwardImpl::AlgoCuBlasLt::exec(const ExecArgs& args) const {
igemm();
break;
default:
megdnn_throw(megdnn_mangle("compute type must be float16/float32/int32"));
megdnn_throw(megdnn_mangle(
"compute type must be float16/float32/int32"));
}
#endif
}
#endif
// vim: syntax=cpp.doxygen
......@@ -309,6 +309,9 @@ void benchmark_target_algo(Handle* handle, const std::vector<BenchArgs>& args,
arg.f / (1e12);
TensorShape src{arg.n, arg.ci, arg.hi, arg.wi},
filter{arg.co, arg.ci, arg.f, arg.f};
if (!algo) {
algo = "no_name";
}
printf("src=%s, filter=%s, time(algo=%s)=%.2f %.2fTops, "
"time(cudnn)=%.2f %.2fTops, time(batched_matmul)=%.2f "
"%.2fTops, "
......
Makefile
/test/imperative_test
python/megengine/version.py
\ No newline at end of file
......@@ -70,3 +70,8 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-style.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-style.txt
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/python/requires-test.txt ${CMAKE_CURRENT_BINARY_DIR}/python/requires-test.txt
)
add_custom_command(
TARGET ${MODULE_NAME} POST_BUILD
COMMAND "${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/python/gen_version.py --output ${CMAKE_CURRENT_BINARY_DIR}/python/megengine/version.py
)
import argparse
import os
import subprocess
def get_git_commit(src_dir):
try:
return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=src_dir).decode('ascii').strip()
except Exception:
return 'unknown'
def get_mge_version(version_txt_path):
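    # exec the version template so its __version__ / __internal__
    # assignments land in the dict v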
v = {}
with open(version_txt_path) as fp:
exec(fp.read(), v)
return v
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="generate version.py to build path")
parser.add_argument("--output", type=str, required=True)
args = parser.parse_args()
python_dir = os.path.dirname(__file__)
version_txt_path = os.path.join(python_dir, 'version_template.py')
commit_id = get_git_commit(python_dir)
mge_ver_map = get_mge_version(version_txt_path)
mge_ver = mge_ver_map['__version__'] if '__version__' in mge_ver_map else 'unknown'
mge_intl = mge_ver_map['__internal__'] if '__internal__' in mge_ver_map else False
with open(args.output, 'w') as f:
f.write("__version__ = '{}'\n".format(mge_ver))
f.write("git_version = {}\n".format(repr(commit_id)))
if mge_intl:
f.write("__internal__ = True\n")
# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
__version__ = "1.3.0.dev"
......@@ -8,7 +8,7 @@
```bash
1: please refer to: https://docs.docker.com/engine/security/rootless/ to enable rootless docker env
2: cd ./scripts/whl/manylinux2010
2: cd ./scripts/whl/manylinux2014
3: ./build_image.sh
```
......@@ -56,24 +56,25 @@
```
# How to build
Note: make sure the git repo is mounted in the docker container; do not use `git submodule update --init` to init the megbrain repo
## Build for linux
* MegBrain delivers `wheel` package with `manylinux2010` tag defined in [PEP-571](https://www.python.org/dev/peps/pep-0571/).
* MegBrain delivers a `wheel` package with the `manylinux2014` tag defined in [PEP-599](https://www.python.org/dev/peps/pep-0599/).
commands:
```bash
export CUDA_ROOT_DIR=/path/to/cuda
export CUDNN_ROOT_DIR=/path/to/cudnn
export TENSORRT_ROOT_DIR=/path/to/tensorrt
./scripts/whl/manylinux2010/build_wheel.sh
./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101
```
* You can find all of the outputs in the `output` directory. If you just want to build for a specific Python version, use the `ALL_PYTHON` environment variable, e.g.:
```bash
ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh
ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101
```
* If you just want to build the CPU-only version, set the `BUILD_WHL_CPU_ONLY` environment variable to 'ON', e.g.:
```bash
BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2010/build_wheel.sh
BUILD_WHL_CPU_ONLY="ON" ALL_PYTHON="36m" ./scripts/whl/manylinux2014/build_wheel_common.sh -sdk cu101
```
## Build for MacOS
......
FROM quay.io/pypa/manylinux2014_x86_64:2020-12-31-56195b3
ENV UID=1024 \
PATH=${PATH}:/usr/local/cuda/bin \
LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \
LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/opt/cudnn/lib64:/opt/tensorrt/lib \
CPATH=${CPATH}:/usr/local/cuda/include:/opt/cudnn/include:/opt/tensorrt/include
ARG platform
COPY setup_mirror.sh .
RUN ./setup_mirror.sh "$platform"
ADD init_image.sh /tmp
RUN /tmp/init_image.sh && rm -f /tmp/init_image.sh
#!/bin/bash -e
cd $(dirname $0)
docker build -t env_manylinux2014:latest .
#!/bin/bash
set -e
CWD=$(dirname $0)
BASEDIR=$(readlink -f ${CWD}/../../..)
OUTPUTDIR=$(readlink -f ${CWD}/output)
USERID=$(id -u)
TMPFS_ARGS="--tmpfs /tmp:exec"
local_path=$(dirname $(readlink -f $0))
CUDNN_LIB_DIR="/opt/cudnn/lib64/"
CUDA_LIB_DIR="/usr/local/cuda/lib64/"
CUDA_SDK="unknown"
function usage() {
echo "use '-sdk cu111' to specify cuda toolkit config, also support cu101, cu112"
}
while [ "$1" != "" ]; do
case $1 in
-sdk)
shift
CUDA_SDK=$1
shift
;;
*)
usage
exit 1
esac
done
echo "Build with ${CUDA_SDK}"
if [ $CUDA_SDK == "cu101" ];then
COPY_LIB_LIST="${CUDA_LIB_DIR}/libnvrtc.so.10.1"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=OFF"
OUT_DIR="cu101"
BUILD_GCC8="ON"
REQUIR_CUDA_VERSION="10010"
REQUIR_CUDNN_VERSION="7.6.3"
REQUIR_TENSORRT_VERSION="6.0.1.5"
elif [ $CUDA_SDK == "cu111" ];then
COPY_LIB_LIST="\
${CUDA_LIB_DIR}/libnvrtc.so.11.1:\
${CUDA_LIB_DIR}/libcublasLt.so.11:\
${CUDA_LIB_DIR}/libcublas.so.11:\
${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON\
-gencode arch=compute_61,code=sm_61 \
arch=compute_70,code=sm_70 \
arch=compute_75,code=sm_75 \
arch=compute_80,code=sm_80 \
arch=compute_86,code=sm_86 \
arch=compute_86,code=compute_86"
OUT_DIR="cu111"
REQUIR_CUDA_VERSION="11010"
REQUIR_CUDNN_VERSION="8.0.5"
REQUIR_TENSORRT_VERSION="7.2.2.3"
elif [ $CUDA_SDK == "cu112" ];then
COPY_LIB_LIST="\
${CUDA_LIB_DIR}/libnvrtc.so.11.2:\
${CUDA_LIB_DIR}/libcublasLt.so.11:\
${CUDA_LIB_DIR}/libcublas.so.11:\
${CUDNN_LIB_DIR}/libcudnn_adv_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_adv_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn_cnn_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_cnn_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_infer.so.8:\
${CUDNN_LIB_DIR}/libcudnn_ops_train.so.8:\
${CUDNN_LIB_DIR}/libcudnn.so.8"
EXTRA_CMAKE_FLAG=" -DMGE_WITH_CUDNN_SHARED=ON \
-gencode arch=compute_61,code=sm_61 \
arch=compute_70,code=sm_70 \
arch=compute_75,code=sm_75 \
arch=compute_80,code=sm_80 \
arch=compute_86,code=sm_86 \
arch=compute_86,code=compute_86"
OUT_DIR="cu112"
REQUIR_CUDA_VERSION="11020"
REQUIR_CUDNN_VERSION="8.0.5"
REQUIR_TENSORRT_VERSION="7.2.2.3"
else
echo "no support sdk ${CUDA_SDK}, please set by '-sdk cu111'"
exit -1
fi
BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY}
if [[ -z ${BUILD_WHL_CPU_ONLY} ]]
then
BUILD_WHL_CPU_ONLY="OFF"
fi
echo ${BASEDIR}
pushd ${BASEDIR}/third_party >/dev/null
./prepare.sh
popd >/dev/null
cd ${CWD}
mkdir -p ${OUTPUTDIR}
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
if [[ -z ${CUDA_ROOT_DIR} ]]; then
echo "Environment variable CUDA_ROOT_DIR not set."
exit -1
fi
if [[ -z ${CUDNN_ROOT_DIR} ]]; then
echo "Environment variable CUDNN_ROOT_DIR not set."
exit -1
fi
if [[ -z ${TENSORRT_ROOT_DIR} ]]; then
echo "Environment variable TENSORRT_ROOT_DIR not set."
exit -1
fi
## YOU SHOULD MODIFY CUDA VERSION AS BELOW WHEN UPGRADE
CUDA_ROOT_DIR_=${CUDA_ROOT_DIR%*/}
CUDNN_ROOT_DIR_=${CUDNN_ROOT_DIR%*/}
TENSORRT_ROOT_DIR_=${TENSORRT_ROOT_DIR%*/}
CUDA_VERSION_PATH=${CUDA_ROOT_DIR_}/include/cuda.h
if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then
CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn_version.h
else
CUDNN_VERSION_PATH=${CUDNN_ROOT_DIR_}/include/cudnn.h
fi
TENSORRT_VERSION_PATH=${TENSORRT_ROOT_DIR_}/include/NvInferVersion.h
if [ ! -e $CUDA_VERSION_PATH ] ; then
echo file $CUDA_VERSION_PATH does not exist
echo please check the environment: must use CUDA-$REQUIR_CUDA_VERSION
exit -1
fi
if [ ! -e $CUDNN_VERSION_PATH ] ; then
echo file $CUDNN_VERSION_PATH does not exist
echo please check the environment: must use CUDNN-V$REQUIR_CUDNN_VERSION
exit -1
fi
if [ ! -e $TENSORRT_VERSION_PATH ] ; then
echo file $TENSORRT_VERSION_PATH does not exist
echo please check the environment: must use TensorRT-$REQUIR_TENSORRT_VERSION
exit -1
fi
CUDA_VERSION_CONTEXT=$(head -300 ${CUDA_VERSION_PATH})
CUDNN_VERSION_CONTEXT=$(head -62 ${CUDNN_VERSION_PATH})
TENSORRT_VERSION_CONTEXT=$(tail -12 ${TENSORRT_VERSION_PATH})
if [ "$REQUIR_CUDA_VERSION" -ge "11000" ];then
CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define CUDA_VERSION * +([0-9]+)")
else
CUDA_API_VERSION=$(echo $CUDA_VERSION_CONTEXT | grep -Eo "define __CUDA_API_VERSION * +([0-9]+)")
fi
CUDA_VERSION=${CUDA_API_VERSION:0-5}
echo CUDA_VERSION:$CUDA_VERSION
CUDNN_VERSION_MAJOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MAJOR * +([0-9]+)")
CUDNN_VERSION_MINOR=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_MINOR * +([0-9]+)")
CUDNN_VERSION_PATCH=$(echo $CUDNN_VERSION_CONTEXT | grep -Eo "define CUDNN_PATCHLEVEL * +([0-9]+)")
CUDNN_VERSION=${CUDNN_VERSION_MAJOR:0-1}.${CUDNN_VERSION_MINOR:0-1}.${CUDNN_VERSION_PATCH:0-1}
echo CUDNN_VERSION:$CUDNN_VERSION
TENSORRT_VERSION_MAJOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MAJOR * +([0-9]+)")
TENSORRT_VERSION_MINOR=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_MINOR * +([0-9]+)")
TENSORRT_VERSION_PATCH=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_PATCH * +([0-9]+)")
TENSORRT_VERSION_BUILD=$(echo $TENSORRT_VERSION_CONTEXT | grep -Eo "NV_TENSORRT_BUILD * +([0-9]+)")
TENSORRT_VERSION=${TENSORRT_VERSION_MAJOR:0-1}.${TENSORRT_VERSION_MINOR:0-1}.${TENSORRT_VERSION_PATCH:0-1}.${TENSORRT_VERSION_BUILD:0-1}
echo TENSORRT_VERSION:$TENSORRT_VERSION
if [ $CUDA_VERSION != $REQUIR_CUDA_VERSION ] ; then
echo please check the environment: must use CUDA-$REQUIR_CUDA_VERSION
exit -1
fi
if [ $CUDNN_VERSION != $REQUIR_CUDNN_VERSION ] ; then
echo please check the environment: must use CUDNN-V$REQUIR_CUDNN_VERSION
exit -1
fi
if [ $TENSORRT_VERSION != $REQUIR_TENSORRT_VERSION ] ; then
echo please check the environment: must use TENSORRT-$REQUIR_TENSORRT_VERSION
exit -1
fi
fi
if [[ -z ${BUILD_GCC8} ]];then
BUILD_GCC8=OFF
fi
if [ "$BUILD_GCC8" == "ON" ];then
run_cmd="scl enable devtoolset-8 /home/code/scripts/whl/manylinux2014/do_build_common.sh"
else
run_cmd="/home/code/scripts/whl/manylinux2014/do_build_common.sh"
fi
docker run --rm -it $TMPFS_ARGS \
-e UID=${USERID} \
-e LOCAL_VERSION=${LOCAL_VERSION} \
-e BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY} \
-e ALL_PYTHON="${ALL_PYTHON}" \
-e EXTRA_CMAKE_FLAG="$EXTRA_CMAKE_FLAG" \
-e COPY_LIB_LIST="$COPY_LIB_LIST" \
-e OUT_DIR="$OUT_DIR" \
-v ${CUDA_ROOT_DIR}:/usr/local/cuda \
-v ${CUDNN_ROOT_DIR}:/opt/cudnn \
-v ${TENSORRT_ROOT_DIR}:/opt/tensorrt \
-v ${BASEDIR}:/home/code \
-v ${OUTPUTDIR}:/home/output:rw \
env_manylinux2014:latest /bin/bash -c "$run_cmd"
#!/bin/bash -ex
function handle_strip() {
echo "now handle strip $1"
objcopy --only-keep-debug $1 $1.dbg
strip -s $1
objcopy --add-gnu-debuglink=$1.dbg $1
rm $1.dbg
}
function full_copy_so(){
lib_path=$1
dst_dir=$2
append_rpath=$3
lib_name=$(basename $lib_path)
cp $lib_path $dst_dir/$lib_name
if [ "$append_rpath" != "" ];then
ori_rpath=$(patchelf --print-rpath $dst_dir/$lib_name)
if [ "$ori_rpath" != "" ];then
patchelf --set-rpath "$ori_rpath:$append_rpath" $dst_dir/$lib_name
else
patchelf --set-rpath "$append_rpath" $dst_dir/$lib_name
fi
fi
}
function patch_elf_depend_lib() {
echo "handle common depend lib"
LIBS_DIR=${BUILD_DIR}/staging/megengine/core/lib
mkdir -p ${LIBS_DIR}
cp /usr/lib64/libatomic.so.1 ${LIBS_DIR}
patchelf --remove-rpath ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so
patchelf --force-rpath --set-rpath '$ORIGIN/lib' ${BUILD_DIR}/staging/megengine/core/_imperative_rt.so
cp ${BUILD_DIR}/src/libmegengine_export.so ${LIBS_DIR}
patchelf --remove-rpath ${LIBS_DIR}/libmegengine_export.so
patchelf --force-rpath --set-rpath '$ORIGIN/.' ${LIBS_DIR}/libmegengine_export.so
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
echo "handle cuda lib"
cp ${BUILD_DIR}/dnn/cuda-stub/libcuda_stub.so ${LIBS_DIR}
cp /usr/local/cuda/lib64/libnvToolsExt.so.1 ${LIBS_DIR}
IFS=: read -a lib_name_array <<<"$COPY_LIB_LIST"
append_rpath='$ORIGIN/.'
for lib_name in ${lib_name_array[@]};do
full_copy_so $lib_name ${LIBS_DIR} $append_rpath
done
fi
}
ALL_PYTHON=${ALL_PYTHON}
if [[ -z ${ALL_PYTHON} ]]
then
ALL_PYTHON="35m 36m 37m 38"
fi
BUILD_WHL_CPU_ONLY=${BUILD_WHL_CPU_ONLY}
if [[ -z ${BUILD_WHL_CPU_ONLY} ]]
then
BUILD_WHL_CPU_ONLY="OFF"
fi
SRC_DIR=$(readlink -f "`dirname $0`/../../../")
BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_OFF/MGE_INFERENCE_ONLY_OFF/Release/build/
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
BUILD_DIR=${SRC_DIR}/build_dir/host/MGE_WITH_CUDA_ON/MGE_INFERENCE_ONLY_OFF/Release/build/
fi
NEW_LIB_PATH=core/lib
for ver in ${ALL_PYTHON}
do
USE_AUDITWHEEL="ON"
python_ver=${ver:0:2}
MAJOR=${python_ver:0:1}
MINOR=${ver:1}
PYTHON_DIR=/opt/python/cp${python_ver}-cp${ver}/
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} ${EXTRA_CMAKE_FLAG}"
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_BUILD_TYPE=RelWithDebInfo"
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCMAKE_PREFIX_PATH=${PYTHON_DIR}"
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_EXECUTABLE=${PYTHON_DIR}/bin/python3"
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_LIBRARY=${PYTHON_DIR}lib/"
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DPYTHON_INCLUDE_DIR=${PYTHON_DIR}include/python${MAJOR}.${MINOR}"
export EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DMGE_WITH_ATLAS=ON"
if [ ${BUILD_WHL_CPU_ONLY} = "OFF" ]; then
${SRC_DIR}/scripts/cmake-build/host_build.sh -c -t -r
else
${SRC_DIR}/scripts/cmake-build/host_build.sh -t -r
fi
cd ${BUILD_DIR}
rm -rf staging
mkdir -p staging
cp -a imperative/python/{megengine,setup.py,requires.txt,requires-style.txt,requires-test.txt} staging/
handle_strip ${BUILD_DIR}/src/libmegengine_export.so
cd ${BUILD_DIR}/staging/megengine/core
handle_strip _imperative_rt.so
mkdir -p lib/ucx
if [ ${USE_AUDITWHEEL} = "OFF" ]; then
patch_elf_depend_lib
fi
cd ${BUILD_DIR}/staging/
${PYTHON_DIR}/bin/python setup.py bdist_wheel
cd /home/output
if [ ${USE_AUDITWHEEL} = "ON" ]; then
LD_LIBRARY_PATH=${BUILD_DIR}/dnn/cuda-stub:$LD_LIBRARY_PATH auditwheel repair -L ${NEW_LIB_PATH} ${BUILD_DIR}/staging/dist/Meg*.whl
else
mkdir -p ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR}
cd ${BUILD_DIR}/staging/dist/
org_whl_name=`ls Meg*${ver}*.whl`
compat_whl_name=`echo ${org_whl_name} | sed 's/linux/manylinux2014/'`
echo "org whl name: ${org_whl_name}"
echo "comapt whl name: ${compat_whl_name}"
mv ${org_whl_name} ${SRC_DIR}/scripts/whl/manylinux2014/output/wheelhouse/${OUT_DIR}/${compat_whl_name}
cd /home/output
fi
chown -R ${UID}.${UID} .
# compat for root-less docker env to remove output at host side
chmod -R 777 .
echo "python $ver done"
done
#!/bin/bash -e
GET_PIP_URL='https://bootstrap.pypa.io/get-pip.py'
SWIG_URL='https://downloads.sourceforge.net/project/swig/swig/swig-3.0.12/swig-3.0.12.tar.gz?use_mirror=autoselect'
LLVM_URL='https://github.com/llvm-mirror/llvm/archive/release_60.tar.gz'
CLANG_URL='https://github.com/llvm-mirror/clang/archive/release_60.tar.gz'
yum install -y pcre-devel devtoolset-9-libatomic-devel.x86_64
yum install -y devtoolset-8 devtoolset-8-libatomic-devel.x86_64
for ver in 35m 36m 37m 38
do
python_ver=${ver:0:2}
curl ${GET_PIP_URL} | /opt/python/cp${python_ver}-cp${ver}/bin/python - \
--no-cache-dir --only-binary :all:
/opt/python/cp${python_ver}-cp${ver}/bin/pip install \
--no-cache-dir --only-binary :all: numpy==1.18.1 setuptools==46.1.3
done
pushd /home >/dev/null
echo "Install swig"
curl -sSL ${SWIG_URL} | tar xz
pushd swig-3.0.12 >/dev/null
mkdir build
pushd build >/dev/null
../configure
make -j$(nproc)
make install
popd >/dev/null
popd >/dev/null
rm -rf swig-3.0.12
echo "Install llvm"
curl -sSL ${LLVM_URL} | tar xz
pushd llvm-release_60 >/dev/null
mkdir build
pushd build >/dev/null
cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \
-DCMAKE_BUILD_TYPE=Release
make -j$(nproc)
make install
popd >/dev/null
popd >/dev/null
rm -rf llvm-release_60
echo "Install clang"
curl -sSL ${CLANG_URL} | tar xz
pushd clang-release_60 >/dev/null
mkdir build
pushd build >/dev/null
cmake .. -DCMAKE_PREFIX_PATH=/opt/python/cp36-cp36m/ \
-DCMAKE_BUILD_TYPE=Release
make -j$(nproc)
make install
popd >/dev/null
popd >/dev/null
rm -rf clang-release_60
popd >/dev/null
pushd /tmp >/dev/null
curl -sSL https://github.com/NixOS/patchelf/archive/0.12.tar.gz | tar xz
pushd /tmp/patchelf-0.12 >/dev/null
sed -i '331s/32/64/' ./src/patchelf.cc
./bootstrap.sh && ./configure && make install-strip
popd
rm -rf /tmp/patchelf-0.12
popd
yum clean all
#!/bin/bash
set -e
function set_tuna_yum_mirror() {
cp /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.bak
local repo=/etc/yum.repos.d/CentOS-Base.repo
local plugin=/etc/yum/pluginconf.d/fastestmirror.conf
sed -i "s/mirrorlist=/#mirrorlist=/g" $repo
sed -i "s/#baseurl/baseurl/g" $repo
sed -i "s/mirror.centos.org/mirrors.tuna.tsinghua.edu.cn/g" $repo
sed -i "s/http/https/g" $repo
sed -i "s/enabled=1/enabled=0/g" $plugin
yum clean all
# Builds on brainpp are unable to pull epel repo metadata, so disable it
# https://unix.stackexchange.com/questions/148144/unable-to-pull-epel-repository-metadata
yum --disablerepo="epel" update nss
yum makecache
}
function set_epel() {
mv /etc/yum.repos.d/epel.repo /etc/yum.repos.d/epel.repo.backup
mv /etc/yum.repos.d/epel-testing.repo /etc/yum.repos.d/epel-testing.repo.backup
curl -o /etc/yum.repos.d/epel.repo http://mirrors.aliyun.com/repo/epel-7.repo
}
function set_yum_mirror() {
mv /etc/yum.repos.d/CentOS-Base.repo /etc/yum.repos.d/CentOS-Base.repo.backup
curl -o /etc/yum.repos.d/CentOS-Base.repo https://mirrors.aliyun.com/repo/Centos-7.repo
yum makecache
}
function set_pip_mirror() {
cat > /etc/pip.conf <<EOF
[global]
timeout = 180
index-url = https://mirrors.aliyun.com/pypi/simple
extra-index-url =
http://mirrors.i.brainpp.cn/pypi/simple/
http://pypi.i.brainpp.cn/brain/dev/+simple
https://pypi.tuna.tsinghua.edu.cn/simple
trusted-host =
mirrors.i.brainpp.cn
pypi.i.brainpp.cn
pypi.tuna.tsinghua.edu.cn
mirrors.aliyun.com
EOF
}
function main() {
local platform=$1
case $platform in
brainpp)
set_epel
set_yum_mirror
set_pip_mirror
;;
*)
echo "No setup required"
;;
esac
}
main "$@"
......@@ -81,8 +81,9 @@ void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp,
init(opr, inp.shape());
auto inp_ptr = inp.ptr<float>();
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace.raw_ptr()),
dev_rm_mask = reinterpret_cast<uint64_t*>(
void* workspace_ptr = workspace.raw_ptr();
auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr),
dev_rm_mask = (uint64_t*)(
workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align);
auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()),
out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
......
......@@ -27,6 +27,9 @@
#include "megbrain/gopt/inference.h"
#include "megbrain/gopt/misc.h"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
using namespace mgb;
using namespace gopt;
using namespace cg;
......@@ -1749,6 +1752,7 @@ void mgb::tensorrt::transform_dest_vars_inplace(
optimizer.apply_inplace(dest_vars);
}
#pragma GCC diagnostic pop
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -20,7 +20,8 @@
#include "megbrain/utils/debug.h"
#if MGB_ENABLE_TENSOR_RT
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#include "megbrain/tensorrt/tensorrt_opr.h"
#include "make_trt_net.h"
......@@ -111,7 +112,8 @@ intl::SimpleQuantizedTensorRTNetwork::SimpleQuantizedTensorRTNetwork() {
host_b = range_gen({1, 8, 1, 1});
{
float* ptr = reinterpret_cast<float*>(host_w->raw_ptr());
void* w_ptr = host_w->raw_ptr();
float* ptr = reinterpret_cast<float*>(w_ptr);
ptr[0] = -127*1.1f;
ptr[1] = 127*1.1f;
}
......@@ -362,6 +364,7 @@ intl::ConcatConvTensorRTNetwork::create_trt_network(bool has_batch_dim) {
return std::make_pair(builder, network);
}
#pragma GCC diagnostic pop
#endif // MGB_ENABLE_TENSOR_RT
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}