diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h index 823c298f643127968a50b795e3924129b6e9ee19..819d568bb92b63b97e14e65ee371bdd6490df90b 100644 --- a/csrc/aio/py_lib/deepspeed_py_copy.h +++ b/csrc/aio/py_lib/deepspeed_py_copy.h @@ -6,8 +6,10 @@ Licensed under the MIT license. Functionality for swapping optimizer tensors to/from (NVMe) storage devices. */ +#if (__x86_64__ || __i386__) #include #include +#endif #include #include diff --git a/op_builder/async_io.py b/op_builder/async_io.py index 504173424e92d4760ebcd76f2d69f0d6a605d230..78aa2fe92d67c2a307271e1a8af9f43ed8a333f7 100644 --- a/op_builder/async_io.py +++ b/op_builder/async_io.py @@ -30,7 +30,9 @@ class AsyncIOBuilder(OpBuilder): return ['csrc/aio/py_lib', 'csrc/aio/common'] def cxx_args(self): - args = [ + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + return [ '-g', '-Wall', '-O0', @@ -38,17 +40,12 @@ class AsyncIOBuilder(OpBuilder): '-shared', '-fPIC', '-Wno-reorder', - '-march=native', + CPU_ARCH, '-fopenmp', + SIMD_WIDTH, '-laio', ] - simd_width = self.simd_width() - if len(simd_width) > 0: - args.append(simd_width) - - return args - def extra_ldflags(self): return ['-laio'] diff --git a/op_builder/builder.py b/op_builder/builder.py index 21547f896473ff8cd7057b5d899f66a297159cd6..3eeb4e4bfe749feb7a48c9973ecbb767d2de9fd5 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -160,13 +160,34 @@ class OpBuilder(ABC): valid = valid or result.wait() == 0 return valid + def strip_empty_entries(self, args): + ''' + Drop any empty strings from the list of compile and link flags + ''' + return [x for x in args if len(x) > 0] + + def cpu_arch(self): + if not self.command_exists('lscpu'): + self.warning( + f"{self.name} attempted to query 'lscpu' to detect the CPU architecture. " + "However, 'lscpu' does not appear to exist on " + "your system, will fall back to use -march=native.") + return '-march=native' + + result = subprocess.check_output('lscpu', shell=True) + result = result.decode('utf-8').strip().lower() + if 'ppc64le' in result: + # gcc does not provide -march on PowerPC, use -mcpu instead + return '-mcpu=native' + return '-march=native' + def simd_width(self): if not self.command_exists('lscpu'): self.warning( - f"{self.name} is attempted to query 'lscpu' to detect the existence " + f"{self.name} attempted to query 'lscpu' to detect the existence " "of AVX instructions. However, 'lscpu' does not appear to exist on " "your system, will fall back to non-vectorized execution.") - return '' + return '-D__SCALAR__' result = subprocess.check_output('lscpu', shell=True) result = result.decode('utf-8').strip().lower() @@ -175,7 +196,7 @@ class OpBuilder(ABC): return '-D__AVX512__' elif 'avx2' in result: return '-D__AVX256__' - return '' + return '-D__SCALAR__' def python_requirements(self): ''' @@ -220,11 +241,12 @@ class OpBuilder(ABC): def builder(self): from torch.utils.cpp_extension import CppExtension - return CppExtension(name=self.absolute_name(), - sources=self.sources(), - include_dirs=self.include_paths(), - extra_compile_args={'cxx': self.cxx_args()}, - extra_link_args=self.extra_ldflags()) + return CppExtension( + name=self.absolute_name(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())}, + extra_link_args=self.strip_empty_entries(self.extra_ldflags())) def load(self, verbose=True): from ...git_version_info import installed_ops, torch_info @@ -264,15 +286,17 @@ class OpBuilder(ABC): os.makedirs(ext_path, exist_ok=True) start_build = time.time() + sources = [self.deepspeed_src_path(path) for path in self.sources()] + extra_include_paths = [ + self.deepspeed_src_path(path) for path in self.include_paths() + ] op_module = load( name=self.name, - sources=[self.deepspeed_src_path(path) for path in self.sources()], - extra_include_paths=[ - self.deepspeed_src_path(path) for path in self.include_paths() - ], - extra_cflags=self.cxx_args(), - extra_cuda_cflags=self.nvcc_args(), - extra_ldflags=self.extra_ldflags(), + sources=self.strip_empty_entries(sources), + extra_include_paths=self.strip_empty_entries(extra_include_paths), + extra_cflags=self.strip_empty_entries(self.cxx_args()), + extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()), + extra_ldflags=self.strip_empty_entries(self.extra_ldflags()), verbose=verbose) build_duration = time.time() - start_build if verbose: @@ -356,12 +380,12 @@ class CUDAOpBuilder(OpBuilder): from torch.utils.cpp_extension import CUDAExtension assert_no_cuda_mismatch() return CUDAExtension(name=self.absolute_name(), - sources=self.sources(), - include_dirs=self.include_paths(), - libraries=self.libraries_args(), + sources=self.strip_empty_entries(self.sources()), + include_dirs=self.strip_empty_entries(self.include_paths()), + libraries=self.strip_empty_entries(self.libraries_args()), extra_compile_args={ - 'cxx': self.cxx_args(), - 'nvcc': self.nvcc_args() + 'cxx': self.strip_empty_entries(self.cxx_args()), + 'nvcc': self.strip_empty_entries(self.nvcc_args()) }) def cxx_args(self): diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py index e2fd8a9ef664a9f15234fd9e47caf462eb1d6056..ed6b4c8a25a7a9570809b6c8dc64ad0c03a3951e 100644 --- a/op_builder/cpu_adam.py +++ b/op_builder/cpu_adam.py @@ -29,38 +29,6 @@ class CPUAdamBuilder(CUDAOpBuilder): CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include") return ['csrc/includes', CUDA_INCLUDE] - def cpu_arch(self): - if not self.command_exists('lscpu'): - self.warning( - "CPUAdam attempted to query 'lscpu' to detect the CPU architecture. " - "However, 'lscpu' does not appear to exist on " - "your system, will fall back to use -march=native.") - return '' - - result = subprocess.check_output('lscpu', shell=True) - result = result.decode('utf-8').strip().lower() - if 'ppc64le' in result: - # gcc does not provide -march on PowerPC, use -mcpu instead - return '-mcpu=native' - return '-march=native' - - def simd_width(self): - if not self.command_exists('lscpu'): - self.warning( - "CPUAdam attempted to query 'lscpu' to detect the existence " - "of AVX instructions. However, 'lscpu' does not appear to exist on " - "your system, will fall back to non-vectorized execution.") - return '' - - result = subprocess.check_output('lscpu', shell=True) - result = result.decode('utf-8').strip().lower() - if 'genuineintel' in result: - if 'avx512' in result: - return '-D__AVX512__' - elif 'avx2' in result: - return '-D__AVX256__' - return '-D__SCALAR__' - def cxx_args(self): import torch CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64") @@ -77,5 +45,5 @@ class CPUAdamBuilder(CUDAOpBuilder): '-Wno-reorder', CPU_ARCH, '-fopenmp', - SIMD_WIDTH + SIMD_WIDTH, ]