diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 7446d892fdc272fc3d91b38d75873270080dcbed..11bd05c09d1ecbbcec6b6596c16416c26635a072 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 
 bool isGpuVersion() {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 5b3737a759ce4d7d85feceda854556d765aba2d8..4547afaf1dc9af8bc7909a684db766fdd7b159c0 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
   paddle::real* buf = ptr->mat->getRowBuf(rowID);
   size_t width = ptr->mat->getWidth();
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
   std::copy(rowArray, rowArray + width, buf);
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
index 2dbc2e66208a3130620160b7a355b0413ad4fd84..793780ea44b80a0d750d35cbea601bfbfd5ccda4 100644
--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -183,4 +183,4 @@ TEST(GradOpDescBuilder, IOIgnoredInGradient) {
                 {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
   delete forw_op;
   delete grad_op;
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index b12c95b6b7b728ca902c63ae96bfecb2f653e20b..4db36ee76609ac6360fe2fc7b4a366e0284d1016 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <memory>
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/system/cuda/experimental/pinned_allocator.h>
@@ -29,7 +29,7 @@
 namespace paddle {
 namespace framework {
 
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
 template <typename T>
 using Vector = std::vector<T>;
 #else
diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc
index b01e30f75371ca4aa63dae86ddfb966b1d4c7830..988a14cf4de8fdf052ca7e8c41bff0c05ba2daaa 100644
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -48,4 +48,4 @@ TEST(ProtoMaker, DuplicatedInOut) {
   paddle::framework::OpAttrChecker op_checker;
   auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
   ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index aca6579f36e6f840c8a9f759cd2baa391c19fddb..958cf581f53b2bb0252b655267f657c8d9ab371c 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -211,7 +211,7 @@ class OpKernelRegistrar : public Registrar {
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
 
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type)        \
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index f89f40b444f893d898595db50e245824a19284f6..b860fe6cac773d1e85adecc43f5dfec42b6c7661 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -183,4 +183,4 @@ class CosineOpComplete : public paddle::framework::CosineOp {
 TEST(OperatorRegistrar, Test) {
   using namespace paddle::framework;
   OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 21c1c6f9e688a34ddfb98f515059409568c06988..2ca838f838ad0b9211a59bf9247c48d283484d50 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -25,7 +25,7 @@ Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
   return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 1cde1f74b83fce2e1d90370a9dad792a43043ffd..379eac94f985c9fa10b6c773065551575f57f033 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -65,7 +65,7 @@ inline T* Tensor::mutable_data(platform::Place place) {
       holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
           boost::get<platform::CPUPlace>(place), size));
     } else if (platform::is_gpu_place(place)) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
 #else
@@ -103,7 +103,7 @@ inline void Tensor::CopyFrom(const Tensor& src,
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&
            platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 86c6945ab588a2bbbc5bd38c98a041e95ce17917..58cf0fc3cb6cf0bad693118ca57d71fb21c55a40 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -74,7 +74,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     float* p1 = nullptr;
@@ -126,7 +126,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     Tensor dst_tensor;
@@ -163,7 +163,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -218,7 +218,7 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
     }
   }
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
index ad78f5f5844a69597a74acba47f4fcdd8a39aa18..bd0fe119ce46df9c333258c9c1ad7b5b2bdc544f 100644
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -194,7 +194,7 @@ public:
 
 REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
 #endif
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index ab18c39df8a74603cc42c911d56e1d6f3fe37402..23916c0f4b6319004ca0793bc9305b8a1dd0ae89 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     CPU,
                     ContextProjectionBackwardFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     GPU,
                     ContextProjectionForwardFunc);
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 4418f144d3a9c88a3c575ce43def119f9ffd82bd..2e5c281f37d8ffb1062121b5dc5b4f790ab52089 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -233,7 +233,7 @@ private:
 
 REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
 #endif
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
index 39504cc2c19868639568c48d13c751133c62a17f..46f98f12c1f150fdf3ed53a7a96e5cf0020e14a4 100644
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
@@ -169,7 +169,7 @@ private:
 
 REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
 #endif
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 1cf0918bedf28350d9f42f3294c2bf89ee1341ba..9e88669d37bd50179dcc0464e8c1cd6e2fed74db 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -336,7 +336,7 @@ private:
 
 REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
 #endif
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 7656ab3d0aca8f54e7a5f9bb3d74ddba295664ad..9863e3ae1d5fcb1eece5267fd4f2a6b593b799df 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -292,7 +292,7 @@ REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
 REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
                     CPU,
                     DepthwiseConvGradFilterFunction);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
 REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
                     GPU,
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
index 39033ecb2b5c6916ab6f500a6f1f1fba7c0fb4e2..b1a90da7db2b647dd384e3772820294140e5ec9d 100644
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(DepthwiseConv, Forward) {
   DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "DepthwiseConv-GPU", forward);
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index 68e08c1480a1c976f2abb9b0d23c5aa63bd78f71..bdb56ddac38b91d756fc6f31282f29c0489fd660 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -340,7 +340,7 @@ public:
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp
index bd1cf3c6a4c3f3fa67f74dd36a6100602d23d6f6..b5b5e1f35b79e422b14f7495bc321533cc1d618a 100644
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
@@ -24,7 +24,7 @@ TEST(GemmConv, NaiveConv) {
       "NaiveConv-CPU", "GemmConv-CPU", forward);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(GemmConv, Forward) {
   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "GemmConv-GPU", forward);
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 55325e94b58dd587474751051d29ad1eb2fc34e3..a0a01a5fc7fc055dce6ddb3ee51c7ab18f8a4ca7 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -116,7 +116,7 @@ void TestIm2ColFunctor() {
 
 TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 655026320c19e3e2c81421d46b28e569deadae1a..704a8c41325ef86067a3bd8ed6d772b77df147c5 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -341,7 +341,7 @@ private:
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
 #endif
 }  // namespace paddle
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index 24c9bf4e72a2a04f1ba6ad5fbedc2880a663990b..eed2f2e3089b6b6167ef7c5a7acb7ecaa08945e1 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -207,7 +207,7 @@ private:
 
 REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
 #endif
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index 09e702f71a5b1d72b715597f18cadf6b6f2b73a7..7c802d66273c6f7aa56b2f460e3dff4401967517 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -217,7 +217,7 @@ public:
 
 REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
 #endif
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
index db839b5b76ffce8880bd71e94585c4731cb7a94c..597723a2dded6a6a116e05b7d4c942cd633e2c99 100644
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
@@ -132,7 +132,7 @@ public:
 
 REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
 #endif
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index 55f52816ab496a1d62e5e2778babab12da17a4ce..bc7d1c83a48aefeb4bc6d3baa32b78aba712e58d 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "BatchNormalizationLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnBatchNormLayer.h"
 #endif
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index 33cf24431d70e541854858d077b2438587ea93f5..dacff25e5927daf9c991577a71be86b160228317 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Stat.h"
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 #include "hl_batch_transpose.h"
 #endif
 #include "BatchNormalizationLayer.h"
@@ -90,7 +90,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
   size_t batchSize = in->getHeight();
   CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
   if (useGpu_) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
@@ -127,7 +127,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
   }
   CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
   if (useGpu_) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 43ab4e4d4741cc938e92f948d48eee853ebe808b..7b932d5a76e9c4fe7cbe5882bbc19eb3de4b503a 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "PoolLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnPoolLayer.h"
 #endif
 namespace paddle {
@@ -53,7 +53,7 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   const std::string& pool = config.inputs(0).pool_conf().pool_type();
   if (pool == "max-projection" || pool == "avg-projection") {
     return new PoolProjectionLayer(config);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 59df057a807e5d8bc0ab13b8f8961e3374517045..cd957c7c0bca4c6089cc07e8f4226b8260190f07 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -674,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf,
                          bool useGpu,
                          bool useWeight,
                          float epsilon) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index c1c85f8fac43af6a7f260c10a18499e61ca7f17a..050fde9d0af54e6d3fd46e19903b926019ddd229 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -119,7 +119,7 @@ TEST(Layer, batchNorm) {
   CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 void batchNormInference(int n, int c, int h, int w) {
   MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
   MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index 16556469cbf854a8e61321efe4e93e86f5641952..ffcc47e2a84356807b6591ba11e670d6d3f336ee 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -117,7 +117,7 @@ MatrixPtr doOneConvTest(size_t imgSize,
 }
 
 TEST(Layer, convParaUnified) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   MatrixPtr input, resultCpu, resultGpu;
 
   /// TEST1 for conv ///
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
index 1a83f48fae17410b0014cf6b258b4aac29a86242..dc39c97a87f8b346dc9cc09d6158b1b4069bcf2d 100644
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -150,7 +150,7 @@ TEST(Layer, detectionOutputLayerFwd) {
                            useGpu,
                            result2);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   // GPU case 1.
   useGpu = true;
   inputLoc = Matrix::create(1, 16, false, useGpu);
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 42bb5705726cab26f3b5c126cdf08a75930241bf..62a131171fa5ae973cb3069151a582aaeac9ee0e 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf,
                    string testEvaluatorName,
                    size_t batchSize,
                    bool useGpu) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
index 1594de850210b631594e88bd7326b310a9c72eaf..6386259882f8c70da59e22574c60fadda636aa42 100644
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -97,7 +97,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
       Matrix::create(subSeqStartPosition.back(), 1, false, false);
 
   std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   mode.push_back(true);
 #endif
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index e887dee5f98c6ac5af3171ce43c048d92322186c..90a3352898863a66819e23dd9ecba375ae104a60 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #endif
 #include <gtest/gtest.h>
@@ -258,7 +258,7 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                      true);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Projection, conv) {
   /// test ConvProjection
   testProjectionConv(1, false);
@@ -422,7 +422,7 @@ TEST(Layer, depthwiseConvLayer) {
   //  'depthwise_conv' is a sepecial case of 'exconv' whose
   //  groups size equals to the input channels size.
   testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testDepthwiseConvLayer("exconv", /* useGpu= */ true);
 #endif
 }
@@ -480,7 +480,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, convLayer) {
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -525,7 +525,7 @@ TEST(Layer, convTransLayer) {
   for (auto useGpu : {false, true}) {
     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
   }
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -638,7 +638,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) {
                 /* trans= */ false,
                 /* useGup= */ false,
                 false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testLayerGrad(config,
                 "selective_fc",
                 100,
@@ -1210,7 +1210,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
   testLayerGrad(config, "pool", 100, trans, useGpu);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
@@ -1236,7 +1236,7 @@ TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
@@ -1309,7 +1309,7 @@ void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, Pool3DLayer) {
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -1695,7 +1695,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, BatchNormalizationLayer) {
   testBatchNormLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNormLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNormLayer("cudnn_batch_norm", false, true);
@@ -1744,7 +1744,7 @@ void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, testBatchNorm3DLayer) {
   testBatchNorm3DLayer("batch_norm", false, false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNorm3DLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNorm3DLayer("cudnn_batch_norm", false, true);
@@ -2262,7 +2262,7 @@ void test3DConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DConvLayer) {
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -2339,7 +2339,7 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DDeConvLayer) {
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index e322fef9a468c60a2f7dd4842207cf1cac44935e..2b92211936aad1a034369bda0830bed3438cf401 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -243,7 +243,7 @@ TEST(Compare, concat_slice) {
   compareNetwork(config_file_a, config_file_b);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
   std::string config_file_b = "./gserver/tests/img_pool_b.conf";
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index cbc0fff7b895d0b34b0af42e881aa68a1754eeea..8dc5568784295b5a2e7d4decd178d612432a1a18 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) {
                     useGpu,
                     result);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   // reset the input parameters
   variance[1] = 0.1;
   variance[3] = 0.2;
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index 988dbc2513d7d64f647036ed5a8b890143d0e606..af6472619d1840e82787974d265d601b4a406c09 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -485,7 +485,7 @@ TEST(ProtoDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
@@ -525,7 +525,7 @@ TEST(ProtoDataProvider, constant_slots) {
       for (int numConstantSlots : {1, 2}) {
         for (int useGpu : numTwoArray) {
           for (int dataCompression : numTwoArray) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
@@ -708,7 +708,7 @@ TEST(ProtoSequenceDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index f6522febf8cae8be03e98564758a93eca73de9d8..fe54799259d86064c4fcaec0e53707247981a1b4 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) {
   config.clear_files();
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) {
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
   EXPECT_EQ(config.IsInitialized(), true);
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index b25d32fb2c045758abf1b65e78adb68b705ee3e9..4c87fe1bba1eff3c081754b3b255a0d7d1b3dfbe 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -321,7 +321,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
       "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
 
   for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
     if (useGpu) {
       break;
     }
@@ -388,7 +388,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
                           outMatSelfc->getWidth(),
                           outMatSelfc->getElementCnt()));
   cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -418,7 +418,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
   MatrixPtr cpuOutMatFc(
       new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
   cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -443,7 +443,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
   selLayerConfig.set_size(fcLayerWidth);
 
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
index f28149081b11d52651747f8fccc6e9f13d672d87..3366002ca113a3fd1c8a8c597e619a50f1e29931 100644
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -195,7 +195,7 @@ TEST(Layer, SeqSliceLayer) {
   vector<vector<real>> ends;
 
   std::vector<bool> mode = {false};
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   mode.push_back(true);
 #endif
   genSeqInfo(seqStartPos, subSeqStartPos);
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index ae5b64257fa06df696e0def76c67e21722f9bc97..da829460061d38f363317e33daeb65cfa705bb8e 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) {
     for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
           if (useGpu) continue;
 #endif
           LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index de02f9c0d57f591052f9c28ac3f9afe418fd5df4..c3e34d5309d9ca8a32d7b0a8043e668cdb5be54b 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -670,7 +670,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
 }
 
 void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -694,7 +694,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
 }
 
 void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -741,7 +741,7 @@ void GpuMatrix::rowMax(Matrix& max) {
 }
 
 void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 1f31082ae8ea840ed2eeef33ac97791a1ca4e966..284b68d590ba655395c0186d8ea86d6855c6fc50 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() {
 }
 
 void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index 54e57b255d3cf5e71618c15322e37e037c20210c..ff72672e3ab77212b309fcfea835839a916fa632 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -172,7 +172,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
 
 template <class T>
 void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   hl_vector_select_from<T>(this->getData(),
                            this->getSize(),
                            src.getData(),
@@ -850,7 +850,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
                                 size_t size)
     : sync_(nullptr) {
   CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   SyncedFlag* flag = src.getSync();
   if (*flag == DATA_AT_CPU) {
     src.copyToGpu();  // will set synchronous data between CPU and GPU
@@ -861,7 +861,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
   auto cMemHandle = (src.getVector(false))->getMemoryHandle();
   cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
       size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   auto gMemHandle = (src.getVector(true))->getMemoryHandle();
   gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
       size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index cf2f66aea153bab22220e4e5a8b580778cddd058..1fecf659e5080c7d25f5f76b92b15f75eaab6ce3 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -68,7 +68,7 @@ void testPoolAllocator() {
 
 TEST(Allocator, Pool) {
   testPoolAllocator<CpuAllocator>();
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testPoolAllocator<GpuAllocator>();
 #endif
 }
@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) {
   EXPECT_EQ(ptr1, ptr2);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MemoryHandle, Gpu) {
   int numGpu = hl_get_device_count();
 
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 730759f3dbfd4c43938c41f2bc30f3b703b3de77..1766257860b0b13e9f0ce898438e7c2d644f545e 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
  * implementation of CPU and GPU member function in
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index ccb4a902b0baf9e279857c760f0396433d187966..c72f89c8244b1209e490b09387c2ee6352426ce1 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 2d439cd0600433989bb0150ad227c8e32741cfb0..25e0ba11ded96dd78aedc3c297507d0555d80d74 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -94,7 +94,7 @@ void testWrapper(F&& f) {
   }
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(ExecViaCpu, test1) {
   testWrapper(f);
   testWrapper(&f);
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index 6dab187e3e0a9275c317775a15260d313e480415..9402bd3ec48fbed381ef1f676e8b179cabd4cb9f 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 7a145eae6a2e7bd221b0f33a1430bfd020d86931..2f99fa3581e14b91acc0b294856619f4ae2b3483 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithArg to compares the
  * implementation of CPU and GPU member function in Matrix.cpp.
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 8151dde106c9c824f785c0bcd5812bb84f2c75f8..8abbe8d82e02b7d1738fe7e6d0c8d494166e7892 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -47,7 +47,7 @@ struct MatrixPara {
   SparseFormat format;
 };
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 void test_sparse_matrix_mul(MatrixPara paraA,
                             MatrixPara paraB,
                             MatrixPara paraC) {
@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
   matC->trimFrom(*mat);
@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
   matC->trimFrom(*mat);
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 36ac0240076db80dd50a46cfeef0445e355762ca..5ae0aa036f6bfc1e5bd4e955277c4efff8c739ce 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
 typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
 
 void testCase(testMatrixFunc matrixFunc) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   for (auto useGpu : {false, true}) {
 #else
   for (auto useGpu : {false}) {
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 0189e534eb2c001298ee31f9e1b22e0b5d5b4ef4..b70a61976402fd0a7cfee8382fd926fcf28486d5 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   const int nx = 100;
   const int ny = 50;
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 7735877ac89af2832df199c18bf42494b375212f..7e5a1db44a5302e3b4e5d2768755824666e880ba 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
 /// only cpu version.
 
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index dff18136ae54877671e2f80f7265021870efa6ec..c7c07c817a08d78ddcbf8218e8c4a9d22f4990bc 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index e39cc0a2f64572a4594ee8f22ecf4f838c691696..2b2a391b9d04a9f7fa4986a6b6dd5cd8e5385f1f 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
 //  so disable when
 /// only cpu version.
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ed0c3374ff6645725c907947c43733b6a3d14eef..fdc5ed19dc2973e744676c3b795c8ab86da58590 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -175,7 +175,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
       // Compute the maximum allocation size for the first allocation.
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 64f8182b5cd9eb3e5b9fef5d0aaec700f6e3e17f..6c9a46dd09c15347fca1a30971e7e732d887bc8e 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -62,7 +62,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool CPUAllocator::UseGpu() const { return false; }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 6b1f40347b983b0a73d76919e5e6e8aa91f2794e..ee9b012f91a9647839cf465c4074082f2d3509a6 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -40,7 +40,7 @@ class CPUAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 };
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 57d5443d5098c78cb7a59670a94accec56e841bf..cd563844e7fa23241bb0bb56d1365ef34826c4a8 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
   TestAllocator(a, 0);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index 184d0f8fa72c436fbfa2c83e18a273a83504522e..790420a8ab41b1a61ee35dc086c8b95fa1a02019 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -26,7 +26,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
   std::memcpy(dst, src, num);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
index 7142831d434a3e786620caea47b8715997288b9d..0bccee58c3a22379c75523467e0c717b98b08bcf 100644
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@@ -33,7 +33,7 @@ namespace memory {
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 /**
  * \brief   Copy memory from one place to another place.
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 6d5a74dafe642f2959928c77b2bbd1239b721847..355b6218d0b41350bc8727858c341ed7eeccf195 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -62,7 +62,7 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
   using BuddyAllocVec = std::vector<BuddyAllocator*>;
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 7a617f04dc5447d4f1770b500a92e88c58355a6b..0d402038a06f4ad93fd15946fc44aaeac58ada40 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -80,7 +80,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   }
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 size_t align(size_t size, paddle::platform::GPUPlace place) {
   size += sizeof(paddle::memory::detail::Metadata);
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h
index 9f05a2632266a11f23d34162ee6fdc2260ec7cc3..068c82f399316a1587d7322d8dab75823656800e 100644
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -34,7 +34,7 @@ struct StridedMemcpyFunctor<T, 1> {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
     } else {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::GPUPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index 3d040ca2b546ff66a4e8c35c3871fb6b1e92b210..40bdbfe73351a609a4ab9fdc27ac5ff6710df2a2 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -71,7 +71,7 @@ void testIm2col() {
     context =
         new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
   } else {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
     context =
         new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
 #else
@@ -116,7 +116,7 @@ void testIm2col() {
 
 TEST(math, im2col) {
   testIm2col<paddle::platform::CPUPlace>();
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   testIm2col<paddle::platform::GPUPlace>();
 #endif
 }
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 2252268620ffc68bb1604596379a68416083918f..9945ba101d719848aa0c06fa65629d59f167c083 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -1,7 +1,7 @@
 #include "paddle/operators/math/math_function.h"
 #include "gtest/gtest.h"
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(math_function, notrans_mul_trans) {
   paddle::framework::Tensor input1;
   paddle::framework::Tensor input1_gpu;
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
index e0dd7b19f14f04cb1ef6f3e9b06fcb20d6aebc6b..68f064eaee5851333ddf9767b7138da83a28503d 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) {
   }
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(StridedMemcpy, GPUCrop) {
   // clang-format off
   int src[] = {
@@ -157,4 +157,4 @@ TEST(StridedMemcpy, GPUConcat) {
 
 #endif
 }  // namespace operators
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 8dcc357a16a057416b7e86a9de8ee5fee8cfcfc3..a9b6b799036a4f2ba93ef52398131db4fcb599f5 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -35,7 +35,7 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 template <>
 Eigen::GpuDevice*
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index c1c4c7f7600b7e404c5b8db42172d87dc7cf7342..ef5f19214d9ccb23b9c946bee28cb764122bd7cd 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/gpu_info.h"
@@ -61,7 +61,7 @@ class CPUDeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 struct EigenDeviceConverter<platform::GPUPlace> {
   using EigenDeviceType = Eigen::GpuDevice;
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index f9fe521d50729fdb20d8be45a89b81fd8f0f4564..15d8446cd8dceb2fdc03536e1f7bbcde73403a23 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -29,7 +29,7 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
@@ -113,7 +113,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   }
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
index 80bdee3d9dfbe38ef707a6ba60cdb7f7b99714de..8206a055eabf4abf584962e921610d5029e2f571 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -213,4 +213,4 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
   ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
-}
\ No newline at end of file
+}
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index ac884386dde1f9e6ec3433bd46705f78cfd6b1e6..e47c9b4a2abf885987e634c3fc5b0d91a0f2555e 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <stddef.h>
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index 8145799dfdc6d88959c65904621e4820f5e06c9b..619897ca19eb2e6f4dbfd9160edf8c4bc58c89a9 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -16,7 +16,7 @@
 
 #include <boost/config.hpp>
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 
 // Because boost's variadic templates has bug on nvcc, boost will disable
 // variadic template support when GPU enabled on nvcc.
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 96724530f5e1adb3135d9931d4b258d0ccf1c30d..b43461d61bab21747e85090bbf7af21a87a670c6 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -215,7 +215,7 @@ int main(int argc, char** argv) {
 
   uint64_t dataSize = FLAGS_dim * sizeof(real);
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   GpuVector gpuParam(FLAGS_dim);
   GpuVector gpuGrad(FLAGS_dim);
 #else
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 74ab1f2f77ec503c23ac81b9e2aaf60d21342b0b..ad8ffed9c1c8e4bdef27689ab21950db6b5cf0a2 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) {
 }
 
 TEST(ProtoServer, extended) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   ProtoClient* client;
   if (FLAGS_rdma_tcp == "rdma")
     client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 761d82fc4d0c5bbc8ebc8a88e1a4d6745fb564bd..cff54b174134879a3779c7738cfc3b43a074f8d7 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -34,7 +34,7 @@ static size_t UniqueIntegerGenerator() {
 }
 
 bool IsCompileGPU() {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
@@ -78,7 +78,7 @@ PYBIND11_PLUGIN(core) {
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
       .def("set", PyCPUTensorSetFromArray<double>)
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
       .def("set", PyCUDATensorSetFromArray<double>)
@@ -96,7 +96,7 @@ PYBIND11_PLUGIN(core) {
       .def(
           "__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
             new (&instance) LoDTensor(lod);
 #else
              LoD new_lod;
@@ -107,7 +107,7 @@ PYBIND11_PLUGIN(core) {
           })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
              self.set_lod(lod);
 #else
              LoD new_lod;
@@ -117,7 +117,7 @@ PYBIND11_PLUGIN(core) {
 #endif
            })
       .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
         return self.lod();
 #else
            auto lod = self.lod();
@@ -203,7 +203,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::GPUPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
                     PADDLE_THROW("GPUPlace is not supported in CPU device.");
 #else
                     return new paddle::platform::CUDADeviceContext(place);
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index 62e85fa54fba7ddbd017f3b12f7abdfae8a7c48c..9e73f79cbdd545db558bd8641bc52e4bf3b0664f 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -106,7 +106,7 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
     framework::Tensor &self,
diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc
index 542c771a98ec8ae187cd4f821ed1ee4373427041..971484dd0c073762e99f3926576eb21b96197769 100644
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
@@ -36,4 +36,4 @@ TEST(to_string, user_defined) {
   using namespace paddle::string;
   UserDefinedClass instance;
   ASSERT_EQ(kOutputString, to_string(instance));
-}
\ No newline at end of file
+}
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index a37d53bc724777a537b7706892ca84fd354c3b33..6c52eaf4494bb247324b29981d94d7e97e0f212a 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -29,7 +29,7 @@ int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
   string confFile = TrainerConfigHelper::getConfigNameFromPath(FLAGS_model_dir);
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   FLAGS_use_gpu = false;
 #endif
   auto config = std::make_shared<TrainerConfigHelper>(confFile);
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index b5d29da45af2a40962d6a2800243a980bd29d28d..f3a964acb69be059a43470f7b68910a3b6cecaab 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) {
 }
 
 int main(int argc, char** argv) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   exit(0);
 #endif
   paddle::initMain(argc, argv);
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 4da9ce20fb3f91b4763c3e9f651f93364d84233e..5f1834bd730375fc10762fc19788d0c693f8e752 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
@@ -198,7 +198,7 @@ TEST(compareSparse, NeuralNetwork) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index f69e1aafeeebbdfc847585dfe071a44689a5b019..425b3d10a38086463784ba2a18db1293efe96e92 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -51,7 +51,7 @@ void checkGradientTest(const string& configFile,
 
 TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
 
 TEST(checkGradient, multiGpu) {
@@ -97,7 +97,7 @@ TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
   checkGradientTest(configFile3, false, false);
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   checkGradientTest(configFile3, true, true);
 #endif
 }
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 4c4d124fa9d95bc3def0700b97e30cd2558a009c..b2a93d4d5eea37ad716b59427f2aa4409d2f537d 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile,
 // 1. test trainer (cpu, gpu).
 TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
 
 TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
@@ -94,7 +94,7 @@ TEST(trainerOnePass, parallel) {
 #endif
 
 // 2. test average_window.
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(average_window, gpu) {
   trainerOnePassTest(configFile1, true, false, 4, 0.01);
 }
@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
   checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
 }
 
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkRemoteUpdater, gpuTrainer) {
   checkRemoteParameterUpdaterTest(configFile1, true, false);
 }
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 74b4fed7edd62ff24282f614b3886ee4cafea630..a8fbe31c2b1e228107dfc19483444409bfcbf788 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -113,7 +113,7 @@ void testGeneration(const string& configFile,
 #ifndef PADDLE_TYPE_DOUBLE
 
 TEST(RecurrentGradientMachine, test_generation) {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   const auto useGpuConfs = {false};
 #else
   const auto useGpuConfs = {true, false};
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 32155ded3574dae2ff4fc5a3ff327927435ef296..8f100f02e90bcbc7fdcf6f053aec6f95cfb09c1a 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "Flags.h"
 
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
 DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 904d0f506150d8279945b5942de27d1736b7c992..9579881ea3b92abab0189631184bab515afb67a3 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -218,7 +218,7 @@ protected:
  * *d2* is peer device to enable direct access to by the d1 device.
  */
 inline void enablePeerAccess(int d1, int d2) {
-#ifdef PADDLE_WITH_GPU
+#ifdef PADDLE_WITH_CUDA
   if (hl_device_can_access_peer(d1, d2)) {
     SetDevice dev(d1);
     hl_device_enable_peer_access(d2);
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index 611fda83d98fd22f81aff5c5d588841ad490eff7..004d62451cddfee8fbd687938086e04ecb2332a9 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -48,7 +48,7 @@ void printVersion(std::ostream& os);
  * @return return true if paddle compiled with GPU
  */
 constexpr bool isWithGpu() {
-#ifndef PADDLE_WITH_GPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;