diff --git a/demo-client/src/ctr_prediction.cpp b/demo-client/src/ctr_prediction.cpp
index 87fa05b431c2f4e41c81c7551258f24cdcb81161..70b0c841227e411b70ef8c7a6263837804a83b55 100644
--- a/demo-client/src/ctr_prediction.cpp
+++ b/demo-client/src/ctr_prediction.cpp
@@ -30,7 +30,7 @@
 using baidu::paddle_serving::predictor::ctr_prediction::Response;
 using baidu::paddle_serving::predictor::ctr_prediction::CTRReqInstance;
 using baidu::paddle_serving::predictor::ctr_prediction::CTRResInstance;
-int batch_size = 1;
+int batch_size = 16;
 int sparse_num = 26;
 int dense_num = 13;
 int thread_num = 1;
@@ -95,8 +95,12 @@ int create_req(Request* req,
       return -1;
     }
     // add data
-    std::vector<std::string> feature_list =
-        split(data_list[data_index + i], "\t");
+    // avoid out of boundary
+    int cur_index = data_index + i;
+    if (cur_index >= data_list.size()) {
+      cur_index = cur_index % data_list.size();
+    }
+    std::vector<std::string> feature_list = split(data_list[cur_index], "\t");
     for (int fi = 0; fi < dense_num; fi++) {
       if (feature_list[fi] == "") {
         ins->add_dense_ids(0.0);
diff --git a/demo-serving/conf/gflags.conf b/demo-serving/conf/gflags.conf
index f9c735b5e272f528eeaa409762c9c25bba09dcf3..b4eedcc8d9d554b4ce159456b0614b139aa979c2 100644
--- a/demo-serving/conf/gflags.conf
+++ b/demo-serving/conf/gflags.conf
@@ -1,2 +1,2 @@
 --enable_model_toolkit
---enable_cube=false
+--enable_cube=true
diff --git a/demo-serving/op/ctr_prediction_op.cpp b/demo-serving/op/ctr_prediction_op.cpp
index 649d46990aef1b9a782ecfbf91bf25630cfafdf7..a904562b6b303134d5198fbbe01ad2cb79c4ba97 100644
--- a/demo-serving/op/ctr_prediction_op.cpp
+++ b/demo-serving/op/ctr_prediction_op.cpp
@@ -263,28 +263,51 @@ int CTRPredictionOp::inference() {
     return 0;
   }
 
-  if (out->size() != sample_size) {
-    LOG(ERROR) << "Output tensor size not equal that of input";
-    fill_response_with_message(res, -1, "Output size != input size");
+  if (out->size() != 1) {
+    LOG(ERROR) << "Model returned number of fetch tensor more than 1";
+    fill_response_with_message(
+        res, -1, "Model returned number of fetch tensor more than 1");
     return 0;
   }
 
-  for (size_t i = 0; i < out->size(); ++i) {
-    int dim1 = out->at(i).shape[0];
-    int dim2 = out->at(i).shape[1];
+  int output_shape_dim = out->at(0).shape.size();
+  if (output_shape_dim != 2) {
+    LOG(ERROR) << "Fetch LoDTensor should be shape of [sample_size, 2]";
+    fill_response_with_message(
+        res, -1, "Fetch LoDTensor should be shape of [sample_size, 2]");
+    return 0;
+  }
 
-    if (out->at(i).dtype != paddle::PaddleDType::FLOAT32) {
-      LOG(ERROR) << "Expected data type float";
-      fill_response_with_message(res, -1, "Expected data type float");
-      return 0;
-    }
+  if (out->at(0).dtype != paddle::PaddleDType::FLOAT32) {
+    LOG(ERROR) << "Fetch LoDTensor data type should be FLOAT32";
+    fill_response_with_message(
+        res, -1, "Fetch LoDTensor data type should be FLOAT32");
+    return 0;
+  }
 
-    float *data = static_cast<float *>(out->at(i).data.data());
-    for (int j = 0; j < dim1; ++j) {
-      CTRResInstance *res_instance = res->add_predictions();
-      res_instance->set_prob0(data[j * dim2]);
-      res_instance->set_prob1(data[j * dim2 + 1]);
-    }
+  int dim1 = out->at(0).shape[0];
+  int dim2 = out->at(0).shape[1];
+
+  if (dim1 != sample_size) {
+    LOG(ERROR) << "Returned result count not equal to sample_size";
+    fill_response_with_message(
+        res, -1, "Returned result count not equal to sample size");
+    return 0;
+  }
+
+  if (dim2 != 2) {
+    LOG(ERROR) << "Returned result is not expected, should be 2 floats for "
+                  "each sample";
+    fill_response_with_message(
+        res, -1, "Retunred result is not 2 floats for each sample");
+    return 0;
+  }
+
+  float *data = static_cast<float *>(out->at(0).data.data());
+  for (int i = 0; i < dim1; ++i) {
+    CTRResInstance *res_instance = res->add_predictions();
+    res_instance->set_prob0(data[i * dim2]);
+    res_instance->set_prob1(data[i * dim2 + 1]);
   }
 
   for (size_t i = 0; i < in->size(); ++i) {