enable async io op on powerpc architectures (#1224)

* enable async io op on powerpc architectures
* drop any empty strings returned by cxx_args

Co-authored-by: N Olatunji Ruwase <olruwase@microsoft.com>

enable async io op on powerpc architectures (#1224)
* enable async io op on powerpc architectures
* drop any empty strings returned by cxx_args

Co-authored-by: N Olatunji Ruwase <olruwase@microsoft.com>
89b0fb42 · Adam Moody · GitHub · 3fa24208 · 89b0fb42 · 89b0fb42
4 changed files
--- a/csrc/aio/py_lib/deepspeed_py_copy.h
+++ b/csrc/aio/py_lib/deepspeed_py_copy.h
@@ -6,8 +6,10 @@ Licensed under the MIT license.
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */

+#if (__x86_64__ || __i386__)
 #include <cpuid.h>
 #include <x86intrin.h>
+#endif

 #include <deepspeed_aio_common.h>
 #include <stdlib.h>

--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -30,7 +30,9 @@ class AsyncIOBuilder(OpBuilder):
        return ['csrc/aio/py_lib', 'csrc/aio/common']

    def cxx_args(self):
-        args = [
+        CPU_ARCH = self.cpu_arch()
+        SIMD_WIDTH = self.simd_width()
+        return [
            '-g',
            '-Wall',
            '-O0',
@@ -38,17 +40,12 @@ class AsyncIOBuilder(OpBuilder):
            '-shared',
            '-fPIC',
            '-Wno-reorder',
-            '-march=native',
+            CPU_ARCH,
            '-fopenmp',
+            SIMD_WIDTH,
            '-laio',
        ]

-        simd_width = self.simd_width()
-        if len(simd_width) > 0:
-            args.append(simd_width)
-
-        return args
-
    def extra_ldflags(self):
        return ['-laio']


--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -160,13 +160,34 @@ class OpBuilder(ABC):
            valid = valid or result.wait() == 0
        return valid

+    def strip_empty_entries(self, args):
+        '''
+        Drop any empty strings from the list of compile and link flags
+        '''
+        return [x for x in args if len(x) > 0]
+
+    def cpu_arch(self):
+        if not self.command_exists('lscpu'):
+            self.warning(
+                f"{self.name} attempted to query 'lscpu' to detect the CPU architecture. "
+                "However, 'lscpu' does not appear to exist on "
+                "your system, will fall back to use -march=native.")
+            return '-march=native'
+
+        result = subprocess.check_output('lscpu', shell=True)
+        result = result.decode('utf-8').strip().lower()
+        if 'ppc64le' in result:
+            # gcc does not provide -march on PowerPC, use -mcpu instead
+            return '-mcpu=native'
+        return '-march=native'
+
    def simd_width(self):
        if not self.command_exists('lscpu'):
            self.warning(
-                f"{self.name} is attempted to query 'lscpu' to detect the existence "
+                f"{self.name} attempted to query 'lscpu' to detect the existence "
                "of AVX instructions. However, 'lscpu' does not appear to exist on "
                "your system, will fall back to non-vectorized execution.")
-            return ''
+            return '-D__SCALAR__'

        result = subprocess.check_output('lscpu', shell=True)
        result = result.decode('utf-8').strip().lower()
@@ -175,7 +196,7 @@ class OpBuilder(ABC):
                return '-D__AVX512__'
            elif 'avx2' in result:
                return '-D__AVX256__'
-        return ''
+        return '-D__SCALAR__'

    def python_requirements(self):
        '''
@@ -220,11 +241,12 @@ class OpBuilder(ABC):

    def builder(self):
        from torch.utils.cpp_extension import CppExtension
-        return CppExtension(name=self.absolute_name(),
-                            sources=self.sources(),
-                            include_dirs=self.include_paths(),
-                            extra_compile_args={'cxx': self.cxx_args()},
-                            extra_link_args=self.extra_ldflags())
+        return CppExtension(
+            name=self.absolute_name(),
+            sources=self.strip_empty_entries(self.sources()),
+            include_dirs=self.strip_empty_entries(self.include_paths()),
+            extra_compile_args={'cxx': self.strip_empty_entries(self.cxx_args())},
+            extra_link_args=self.strip_empty_entries(self.extra_ldflags()))

    def load(self, verbose=True):
        from ...git_version_info import installed_ops, torch_info
@@ -264,15 +286,17 @@ class OpBuilder(ABC):
        os.makedirs(ext_path, exist_ok=True)

        start_build = time.time()
+        sources = [self.deepspeed_src_path(path) for path in self.sources()]
+        extra_include_paths = [
+            self.deepspeed_src_path(path) for path in self.include_paths()
+        ]
        op_module = load(
            name=self.name,
-            sources=[self.deepspeed_src_path(path) for path in self.sources()],
-            extra_include_paths=[
-                self.deepspeed_src_path(path) for path in self.include_paths()
-            ],
-            extra_cflags=self.cxx_args(),
-            extra_cuda_cflags=self.nvcc_args(),
-            extra_ldflags=self.extra_ldflags(),
+            sources=self.strip_empty_entries(sources),
+            extra_include_paths=self.strip_empty_entries(extra_include_paths),
+            extra_cflags=self.strip_empty_entries(self.cxx_args()),
+            extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
+            extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
            verbose=verbose)
        build_duration = time.time() - start_build
        if verbose:
@@ -356,12 +380,12 @@ class CUDAOpBuilder(OpBuilder):
        from torch.utils.cpp_extension import CUDAExtension
        assert_no_cuda_mismatch()
        return CUDAExtension(name=self.absolute_name(),
-                             sources=self.sources(),
-                             include_dirs=self.include_paths(),
-                             libraries=self.libraries_args(),
+                             sources=self.strip_empty_entries(self.sources()),
+                             include_dirs=self.strip_empty_entries(self.include_paths()),
+                             libraries=self.strip_empty_entries(self.libraries_args()),
                             extra_compile_args={
-                                 'cxx': self.cxx_args(),
-                                 'nvcc': self.nvcc_args()
+                                 'cxx': self.strip_empty_entries(self.cxx_args()),
+                                 'nvcc': self.strip_empty_entries(self.nvcc_args())
                             })

    def cxx_args(self):

--- a/op_builder/cpu_adam.py
+++ b/op_builder/cpu_adam.py
@@ -29,38 +29,6 @@ class CPUAdamBuilder(CUDAOpBuilder):
        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
        return ['csrc/includes', CUDA_INCLUDE]

-    def cpu_arch(self):
-        if not self.command_exists('lscpu'):
-            self.warning(
-                "CPUAdam attempted to query 'lscpu' to detect the CPU architecture. "
-                "However, 'lscpu' does not appear to exist on "
-                "your system, will fall back to use -march=native.")
-            return ''
-
-        result = subprocess.check_output('lscpu', shell=True)
-        result = result.decode('utf-8').strip().lower()
-        if 'ppc64le' in result:
-            # gcc does not provide -march on PowerPC, use -mcpu instead
-            return '-mcpu=native'
-        return '-march=native'
-
-    def simd_width(self):
-        if not self.command_exists('lscpu'):
-            self.warning(
-                "CPUAdam attempted to query 'lscpu' to detect the existence "
-                "of AVX instructions. However, 'lscpu' does not appear to exist on "
-                "your system, will fall back to non-vectorized execution.")
-            return ''
-
-        result = subprocess.check_output('lscpu', shell=True)
-        result = result.decode('utf-8').strip().lower()
-        if 'genuineintel' in result:
-            if 'avx512' in result:
-                return '-D__AVX512__'
-            elif 'avx2' in result:
-                return '-D__AVX256__'
-        return '-D__SCALAR__'
-
    def cxx_args(self):
        import torch
        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
@@ -77,5 +45,5 @@ class CPUAdamBuilder(CUDAOpBuilder):
            '-Wno-reorder',
            CPU_ARCH,
            '-fopenmp',
-            SIMD_WIDTH
+            SIMD_WIDTH,
        ]