diff --git a/csrc/aio/py_lib/deepspeed_py_copy.h b/csrc/aio/py_lib/deepspeed_py_copy.h
index 823c298f643127968a50b795e3924129b6e9ee19..819d568bb92b63b97e14e65ee371bdd6490df90b 100644
--- a/csrc/aio/py_lib/deepspeed_py_copy.h
+++ b/csrc/aio/py_lib/deepspeed_py_copy.h
@@ -6,8 +6,10 @@ Licensed under the MIT license.
 Functionality for swapping optimizer tensors to/from (NVMe) storage devices.
 */
 
+#if (__x86_64__ || __i386__)  // pull in the cpuid/intrinsics headers on x86 targets only
 #include <cpuid.h>
 #include <x86intrin.h>
+#endif  // (__x86_64__ || __i386__)
 
 #include <deepspeed_aio_common.h>
 #include <stdlib.h>
diff --git a/op_builder/async_io.py b/op_builder/async_io.py
index 504173424e92d4760ebcd76f2d69f0d6a605d230..78aa2fe92d67c2a307271e1a8af9f43ed8a333f7 100644
--- a/op_builder/async_io.py
+++ b/op_builder/async_io.py
@@ -30,7 +30,9 @@ class AsyncIOBuilder(OpBuilder):
         return ['csrc/aio/py_lib', 'csrc/aio/common']
 
     def cxx_args(self):
-        args = [
+        CPU_ARCH = self.cpu_arch()
+        SIMD_WIDTH = self.simd_width()
+        return [
             '-g',
             '-Wall',
             '-O0',
@@ -38,17 +40,12 @@ class AsyncIOBuilder(OpBuilder):
             '-shared',
             '-fPIC',
             '-Wno-reorder',
-            '-march=native',
+            CPU_ARCH,
             '-fopenmp',
+            SIMD_WIDTH,
             '-laio',
         ]
 
-        simd_width = self.simd_width()
-        if len(simd_width) > 0:
-            args.append(simd_width)
-
-        return args
-
     def extra_ldflags(self):
         return ['-laio']
 
diff --git a/op_builder/builder.py b/op_builder/builder.py
index 21547f896473ff8cd7057b5d899f66a297159cd6..3eeb4e4bfe749feb7a48c9973ecbb767d2de9fd5 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -160,13 +160,34 @@ class OpBuilder(ABC):
             valid = valid or result.wait() == 0
         return valid
 
+    def strip_empty_entries(self, args):
+        '''
+        Return a new list with the falsy entries ('' or None) of args removed
+        '''
+        return [entry for entry in args if entry]
+
+    def cpu_arch(self):
+        '''Pick '-mcpu=native' on ppc64le hosts (per lscpu), else '-march=native'.'''
+        if not self.command_exists('lscpu'):
+            self.warning(
+                f"{self.name} attempted to query 'lscpu' to detect the CPU architecture. "
+                "However, 'lscpu' does not appear to exist on "
+                "your system, will fall back to use -march=native.")
+            return '-march=native'
+
+        lscpu_report = subprocess.check_output('lscpu', shell=True).decode('utf-8')
+        if 'ppc64le' in lscpu_report.strip().lower():
+            # gcc does not provide -march on PowerPC, use -mcpu instead
+            return '-mcpu=native'
+        return '-march=native'
+
     def simd_width(self):
         if not self.command_exists('lscpu'):
             self.warning(
-                f"{self.name} is attempted to query 'lscpu' to detect the existence "
+                f"{self.name} attempted to query 'lscpu' to detect the existence "
                 "of AVX instructions. However, 'lscpu' does not appear to exist on "
                 "your system, will fall back to non-vectorized execution.")
-            return ''
+            return '-D__SCALAR__'
 
         result = subprocess.check_output('lscpu', shell=True)
         result = result.decode('utf-8').strip().lower()
@@ -175,7 +196,7 @@ class OpBuilder(ABC):
                 return '-D__AVX512__'
             elif 'avx2' in result:
                 return '-D__AVX256__'
-        return ''
+        return '-D__SCALAR__'
 
     def python_requirements(self):
         '''
@@ -220,11 +241,12 @@ class OpBuilder(ABC):
 
     def builder(self):
         from torch.utils.cpp_extension import CppExtension
-        return CppExtension(name=self.absolute_name(),
-                            sources=self.sources(),
-                            include_dirs=self.include_paths(),
-                            extra_compile_args={'cxx': self.cxx_args()},
-                            extra_link_args=self.extra_ldflags())
+        drop_empty = self.strip_empty_entries  # applied to every list passed below
+        return CppExtension(name=self.absolute_name(),
+                            sources=drop_empty(self.sources()),
+                            include_dirs=drop_empty(self.include_paths()),
+                            extra_compile_args={'cxx': drop_empty(self.cxx_args())},
+                            extra_link_args=drop_empty(self.extra_ldflags()))
 
     def load(self, verbose=True):
         from ...git_version_info import installed_ops, torch_info
@@ -264,15 +286,17 @@ class OpBuilder(ABC):
         os.makedirs(ext_path, exist_ok=True)
 
         start_build = time.time()
+        sources = [self.deepspeed_src_path(path) for path in self.sources()]
+        extra_include_paths = [
+            self.deepspeed_src_path(path) for path in self.include_paths()
+        ]
         op_module = load(
             name=self.name,
-            sources=[self.deepspeed_src_path(path) for path in self.sources()],
-            extra_include_paths=[
-                self.deepspeed_src_path(path) for path in self.include_paths()
-            ],
-            extra_cflags=self.cxx_args(),
-            extra_cuda_cflags=self.nvcc_args(),
-            extra_ldflags=self.extra_ldflags(),
+            sources=self.strip_empty_entries(sources),
+            extra_include_paths=self.strip_empty_entries(extra_include_paths),
+            extra_cflags=self.strip_empty_entries(self.cxx_args()),
+            extra_cuda_cflags=self.strip_empty_entries(self.nvcc_args()),
+            extra_ldflags=self.strip_empty_entries(self.extra_ldflags()),
             verbose=verbose)
         build_duration = time.time() - start_build
         if verbose:
@@ -356,12 +380,12 @@ class CUDAOpBuilder(OpBuilder):
         from torch.utils.cpp_extension import CUDAExtension
         assert_no_cuda_mismatch()
         return CUDAExtension(name=self.absolute_name(),
-                             sources=self.sources(),
-                             include_dirs=self.include_paths(),
-                             libraries=self.libraries_args(),
+                             sources=self.strip_empty_entries(self.sources()),
+                             include_dirs=self.strip_empty_entries(self.include_paths()),
+                             libraries=self.strip_empty_entries(self.libraries_args()),
                              extra_compile_args={
-                                 'cxx': self.cxx_args(),
-                                 'nvcc': self.nvcc_args()
+                                 'cxx': self.strip_empty_entries(self.cxx_args()),
+                                 'nvcc': self.strip_empty_entries(self.nvcc_args())
                              })
 
     def cxx_args(self):
diff --git a/op_builder/cpu_adam.py b/op_builder/cpu_adam.py
index e2fd8a9ef664a9f15234fd9e47caf462eb1d6056..ed6b4c8a25a7a9570809b6c8dc64ad0c03a3951e 100644
--- a/op_builder/cpu_adam.py
+++ b/op_builder/cpu_adam.py
@@ -29,38 +29,6 @@ class CPUAdamBuilder(CUDAOpBuilder):
         CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
         return ['csrc/includes', CUDA_INCLUDE]
 
-    def cpu_arch(self):
-        if not self.command_exists('lscpu'):
-            self.warning(
-                "CPUAdam attempted to query 'lscpu' to detect the CPU architecture. "
-                "However, 'lscpu' does not appear to exist on "
-                "your system, will fall back to use -march=native.")
-            return ''
-
-        result = subprocess.check_output('lscpu', shell=True)
-        result = result.decode('utf-8').strip().lower()
-        if 'ppc64le' in result:
-            # gcc does not provide -march on PowerPC, use -mcpu instead
-            return '-mcpu=native'
-        return '-march=native'
-
-    def simd_width(self):
-        if not self.command_exists('lscpu'):
-            self.warning(
-                "CPUAdam attempted to query 'lscpu' to detect the existence "
-                "of AVX instructions. However, 'lscpu' does not appear to exist on "
-                "your system, will fall back to non-vectorized execution.")
-            return ''
-
-        result = subprocess.check_output('lscpu', shell=True)
-        result = result.decode('utf-8').strip().lower()
-        if 'genuineintel' in result:
-            if 'avx512' in result:
-                return '-D__AVX512__'
-            elif 'avx2' in result:
-                return '-D__AVX256__'
-        return '-D__SCALAR__'
-
     def cxx_args(self):
         import torch
         CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
@@ -77,5 +45,5 @@ class CPUAdamBuilder(CUDAOpBuilder):
             '-Wno-reorder',
             CPU_ARCH,
             '-fopenmp',
-            SIMD_WIDTH
+            SIMD_WIDTH,
         ]