diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index 8d433b99ca60b7a52b0abc919329a2ac93978b8e..643e7bced5e349dc4e23fab94906691ccece75b3 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -16,9 +16,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/cuda_primitives.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace operators { @@ -176,6 +178,7 @@ class DataNormGradKernel d_batch_sum, d_batch_square_sum); if (need_sync_stats) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto comm = platform::NCCLCommContext::Instance().Get(0, ctx.GetPlace()); PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), @@ -194,7 +197,13 @@ class DataNormGradKernel LOG(FATAL) << "Fail to sync nccl stream: " << cudaGetErrorString(e_sync); } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU, and need_sync_stats cannot be " + "supported on windows now.")); +#endif } + T *batch_size_data = ctx.Output("BatchSize")->mutable_data(ctx.GetPlace()); T *batch_sum_data = diff --git a/python/paddle/fluid/tests/unittests/test_data_norm_op.py b/python/paddle/fluid/tests/unittests/test_data_norm_op.py index a3f1ece4f43b3a3eb0c32b3d2d5967f6c717574a..ceaf54fccd8956f982796a6d28f848622d3edf73 100644 --- a/python/paddle/fluid/tests/unittests/test_data_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_data_norm_op.py @@ -287,6 +287,11 @@ class TestDataNormOpWithSyncStats(OpTest): def test_sync_stats(self): if not core.is_compiled_with_cuda(): return + if os.name == 'nt': + print( + 'Skip 
TestDataNormOpWithSyncStats because nccl is not supported on windows' + ) + return x = fluid.layers.data(name='x', shape=[1], dtype='int64', lod_level=0) emb = layers.embedding( input=x,