diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index 56e65b6f664f9dc848bf86d21753047717873cfa..a8485a148b17c1a084b9d294c998531ec3a8e071 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -25,17 +25,23 @@ namespace operators {
 using framework::Tensor;
 using platform::Transform;
 
+#ifdef __NVCC__
+template <typename T, typename UnaryOperation>
+__global__ void ClipCudaKernel(const T* input, T* out, int num,
+                               UnaryOperation op) {
+  int idx = threadIdx.x + blockDim.x * blockIdx.x;
+  if (idx < num) {
+    out[idx] = op(input[idx]);
+  }
+}
+#endif
+
 template <typename T>
 class ClipFunctor {
  public:
   explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
   HOSTDEVICE T operator()(const T& x) const {
-    if (x < min_)
-      return min_;
-    else if (x > max_)
-      return max_;
-    else
-      return x;
+    return x < min_ ? min_ : x > max_ ? max_ : x;
   }
 
  private:
@@ -97,9 +103,20 @@ class ClipKernel : public framework::OpKernel<T> {
       T* out_data = out->mutable_data<T>(context.GetPlace());
       const T* x_data = x->data<T>();
       int64_t numel = x->numel();
-      Transform<DeviceContext> trans;
-      trans(context.template device_context<DeviceContext>(), x_data,
-            x_data + numel, out_data, ClipFunctor<T>(min, max));
+      if (platform::is_gpu_place(context.GetPlace())) {
+#ifdef __NVCC__
+        int threads = 256;
+        int blocks = (numel + threads - 1) / threads;
+        ClipCudaKernel<T, ClipFunctor<T>><<<
+            blocks, threads, 0,
+            context.template device_context<platform::CUDADeviceContext>()
+                .stream()>>>(x_data, out_data, numel, ClipFunctor<T>(min, max));
+#endif
+      } else {
+        Transform<DeviceContext> trans;
+        trans(context.template device_context<DeviceContext>(), x_data,
+              x_data + numel, out_data, ClipFunctor<T>(min, max));
+      }
     } else if (x_var->IsType<framework::SelectedRows>()) {
       auto* x = context.Input<framework::SelectedRows>("X");
       auto* out = context.Output<framework::SelectedRows>("Out");
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 364fe773c712389e79d9c3280cf68535c18ffc9c..206eeea87fb03dc32cb9a2e86e7f34b7a78b7101 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -197,6 +197,40 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x,
 }
 
 #ifdef __NVCC__
+template <typename Functor, typename T, typename OutType = T>
+__global__ void ElementwiseKernel(const T *x, const T *y, OutType *out, int pre,
+                                  int n, int post, int total, Functor func) {
+  int tid = threadIdx.x + blockDim.x * blockIdx.x;
+  int idx = tid / post % n;
+  if (tid < total) {
+    out[tid] = func(x[tid], y[idx]);
+  }
+}
+
+template <typename Functor, typename T, typename OutType = T>
+void ComputeElementwiseCUDA(const framework::Tensor *x,
+                            const framework::Tensor *y, framework::Tensor *z,
+                            int pre, int n, int post,
+                            const platform::CUDADeviceContext &ctx,
+                            Functor func, const bool is_xsize_larger = true) {
+  const T *x_data = x->data<T>();
+  const T *y_data = y->data<T>();
+  OutType *out_data = z->mutable_data<OutType>(ctx.GetPlace());
+
+  int numel = pre * n * post;
+  int threads = 256;
+  int blocks = (numel + threads - 1) / threads;
+  if (is_xsize_larger) {
+    ElementwiseKernel<Functor, T,
+                      OutType><<<blocks, threads, 0, ctx.stream()>>>(
+        x_data, y_data, out_data, pre, n, post, numel, func);
+  } else {
+    ElementwiseKernel<Functor, T,
+                      OutType><<<blocks, threads, 0, ctx.stream()>>>(
+        y_data, x_data, out_data, pre, n, post, numel, func);
+  }
+}
+
 template <typename T, typename OutType = T>
 __global__ void CommonForwardBroadcastCUDAKernel(
     const int *x_strides_array, const int *y_strides_array,
@@ -1908,6 +1942,16 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx,
         ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger);
     return;
   }
+
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+    ComputeElementwiseCUDA<Functor, T, OutType>(
+        x, y, z, pre, n, post,
+        ctx.template device_context<platform::CUDADeviceContext>(), func,
+        is_xsize_larger);
+#endif
+    return;
+  }
   if (post == 1) {
     functor.RunRowWise(n, pre);
     return;
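
For reference, the broadcast indexing that ElementwiseKernel relies on maps element tid of the contiguous (pre, n, post)-shaped x to element (tid / post) % n of the length-n y. The self-contained CUDA sketch below is illustrative only, not part of the patch or of PaddlePaddle (the kernel name BroadcastAddKernel is made up); it reproduces that mapping with a plain addition and the same 256-thread, ceil-divided launch configuration used in the kernels above.

// Standalone sketch of the (tid / post) % n broadcast indexing and the
// 256-thread launch configuration used in the patch. Not PaddlePaddle code.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void BroadcastAddKernel(const float* x, const float* y, float* out,
                                   int n, int post, int total) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  if (tid < total) {
    // x is contiguous with shape (pre, n, post); y has length n and is
    // broadcast across the pre and post dimensions.
    int idx = tid / post % n;
    out[tid] = x[tid] + y[idx];
  }
}

int main() {
  const int pre = 2, n = 3, post = 4;
  const int total = pre * n * post;

  float h_x[total], h_y[n], h_out[total];
  for (int i = 0; i < total; ++i) h_x[i] = 0.0f;
  for (int i = 0; i < n; ++i) h_y[i] = static_cast<float>(i + 1);

  float *d_x, *d_y, *d_out;
  cudaMalloc(&d_x, total * sizeof(float));
  cudaMalloc(&d_y, n * sizeof(float));
  cudaMalloc(&d_out, total * sizeof(float));
  cudaMemcpy(d_x, h_x, total * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_y, h_y, n * sizeof(float), cudaMemcpyHostToDevice);

  // Same launch configuration as the patch: 256 threads, ceil-divided blocks.
  int threads = 256;
  int blocks = (total + threads - 1) / threads;
  BroadcastAddKernel<<<blocks, threads>>>(d_x, d_y, d_out, n, post, total);
  cudaMemcpy(h_out, d_out, total * sizeof(float), cudaMemcpyDeviceToHost);

  // Each run of `post` consecutive outputs shares a single y element.
  for (int i = 0; i < total; ++i) printf("%.0f ", h_out[i]);
  printf("\n");

  cudaFree(d_x);
  cudaFree(d_y);
  cudaFree(d_out);
  return 0;
}

With pre = 2, n = 3, post = 4 the sketch prints 1 1 1 1 2 2 2 2 3 3 3 3 twice, confirming that every group of post consecutive outputs reuses one y value, which is exactly what the func(x[tid], y[idx]) call in ElementwiseKernel computes.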