diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 1bd73426522bcac608e54f979a5c049b2d2fa62b..8a35bc908e8d73f9bdc5f0fbf268fb40e5d228ce 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include <stdio.h>
 #include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
 #include "paddle/fluid/platform/cuda_helper.h"
@@ -109,12 +108,10 @@ struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
       const framework::Vector<size_t>& x_lod,   /*expand source lod*/
       const framework::Vector<size_t>& ref_lod, /*expand referenced lod*/
       LoDTensor* out) {
-    int x_item_length = 1;
-    x_item_length = x.numel() / x.dims()[0];
-    VLOG(0) << "x_item_length" << x_item_length;
-    int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
-    int thread_y = std::max(1024 / thread_x, 16);
-    int thread_z = std::min(1024 / thread_x / thread_y, 16);
+    int x_item_length = x.numel() / x.dims()[0];
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
     int block_x = static_cast<int>(ref_lod.size());
     dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
@@ -133,12 +130,10 @@ struct SequenceExpandGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Vector<size_t>& x_lod, /*expand source lod*/
                   const framework::Vector<size_t>& ref_lod, /*expand based lod*/
                   LoDTensor* dx) {
-    int x_item_length = 1;
-    x_item_length = framework::product(dx->dims()) / dx->dims()[0];
-
-    int thread_x = std::max(static_cast<int>(ref_lod.size()), 32);
-    int thread_y = std::max(1024 / thread_x, 16);
-    int thread_z = std::min(1024 / thread_x / thread_y, 16);
+    int x_item_length = framework::product(dx->dims()) / dx->dims()[0];
+    int thread_x = std::min(32, std::max(static_cast<int>(ref_lod.size()), 16));
+    int thread_y = 16;
+    int thread_z = 1024 / thread_x / thread_y;
     int block_x = static_cast<int>(ref_lod.size());
     dim3 block_size(thread_x, thread_y, thread_z);
     dim3 grid_size(block_x, 1);
diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index c55c3e215abdf91e3fe9b1bdb23823747865839a..d62c387c3eebf9df0ab532f4e891da006f239468 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -15,8 +15,6 @@ limitations under the License. */
 #pragma once
 #include <numeric>  // std::iota
 
-#include <glog/logging.h>
-#include <sstream>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/operators/math/math_function.h"