diff --git a/csrc/includes/context.h b/csrc/includes/context.h
index e05c41dc1d0aaa0bb90c4d6f5790e0ea51315d06..f8ae6fc49199c77c15882a7bf8eb199d2b4f6815 100755
--- a/csrc/includes/context.h
+++ b/csrc/includes/context.h
@@ -33,8 +33,8 @@
 
 inline int DS_GET_BLOCKS(const int N)
 {
-    return std::max(
-        std::min((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
+    return (std::max)(
+        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
         // Use at least 1 block, since CUDA does not allow empty block
         1);
 }
diff --git a/csrc/includes/gemm_test.h b/csrc/includes/gemm_test.h
index ff06f884351cf207a23462cacb9223a3c274dfda..b920896b419e89d49fbd1b7a9023cebc6c34de7d 100644
--- a/csrc/includes/gemm_test.h
+++ b/csrc/includes/gemm_test.h
@@ -97,7 +97,7 @@ public:
     template <typename Func>
     int Run(int loops, Func f)
     {
-        float fast_latency = std::numeric_limits<float>::max();
+        float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;
 
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
@@ -252,7 +252,7 @@ public:
     template <typename Func>
     int Run(int loops, Func f)
     {
-        float fast_latency = std::numeric_limits<float>::max();
+        float fast_latency = (std::numeric_limits<float>::max)();
         int fast_algo = 0;
 
         for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
diff --git a/csrc/transformer/ds_transformer_cuda.cpp b/csrc/transformer/ds_transformer_cuda.cpp
index d6c09fb75ae23c52ecf7e44f27f3c0447f4ea95f..85ec0418971c4f053c892cd26b1b41a0406fb386 100644
--- a/csrc/transformer/ds_transformer_cuda.cpp
+++ b/csrc/transformer/ds_transformer_cuda.cpp
@@ -27,8 +27,8 @@ size_t get_workspace_size(int maxBatchSize,
 {
     size_t workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size);
     if (training) {
-        workSpacesize += (std::max((size_t(maxBatchSize) * seq_len * intermediate_size),
-                                   2 * (size_t(maxBatchSize) * heads * seq_len * seq_len)));
+        workSpacesize += ((std::max)((size_t(maxBatchSize) * seq_len * intermediate_size),
+                                     2 * (size_t(maxBatchSize) * heads * seq_len * seq_len)));
         if (gelu_checkpoint) workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size);
     }
     return workSpacesize * sizeof(T);