diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cfec8e70b4a3d166e3b45048408d7f5e45ce6e4..c62cc9bfd70d72d926eeee5eb52a69428855eb9b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -302,6 +302,14 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 
+if (ON_INFER)
+    message(STATUS "On inference mode, will take place some specific optimization.")
+    add_definitions(-DPADDLE_ON_INFERENCE)
+else()
+    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
+    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
+endif()
+
 add_subdirectory(paddle)
 if(WITH_PYTHON)
     add_subdirectory(python)
@@ -312,10 +320,3 @@ if(WITH_DOC)
     find_python_module(recommonmark REQUIRED)
     add_subdirectory(doc)
 endif()
-
-if (ON_INFER)
-    message(STATUS "On inference mode, will take place some specific optimization.")
-else()
-    #TODO(luotao), combine this warning with `make inference_lib_dist` command.
-    message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.")
-endif()
diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h
index 7cf98f27251db3cfe5e8e295ed21056f6e5a2963..e09a24334762388733ba9941ad1674fbc401f707 100644
--- a/paddle/fluid/operators/math/softmax_impl.h
+++ b/paddle/fluid/operators/math/softmax_impl.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
 
+#include "paddle/fluid/operators/math/blas.h"
 namespace paddle {
 namespace operators {
 namespace math {
@@ -65,36 +66,42 @@ void SoftmaxFunctor<DeviceContext, T, is_test>::operator()(
                                                  .broadcast(one_by_class));
 }
 
-template <typename DeviceContext, typename T>
-class SoftmaxFunctor<DeviceContext, T, true> {
+template <typename DeviceContext>
+class SoftmaxFunctor<DeviceContext, float, true> {
   void operator()(const DeviceContext& context, const framework::Tensor* X,
                   framework::Tensor* Y) {
-    auto logits = EigenMatrix<T>::From(*X);
-    auto softmax = EigenMatrix<T>::From(*Y);
-
+    auto in_dims = X->dims();
+    auto out_dims = Y->dims();
+    const float* in_data = X->data<float>();
+    float* out_data = Y->data<float>();
     const int kBatchDim = 0;
     const int kClassDim = 1;
-
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
-
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-
-    auto shifted_logits = (logits -
-                           logits.maximum(along_class)
-                               .eval()
-                               .reshape(batch_by_one)
-                               .broadcast(one_by_class));
-
-    softmax.device(*context.eigen_device()) = shifted_logits.exp();
-    softmax.device(*context.eigen_device()) = (softmax *
-                                               softmax.sum(along_class)
-                                                   .inverse()
-                                                   .eval()
-                                                   .reshape(batch_by_one)
-                                                   .broadcast(one_by_class));
+    // 2D data. Batch x C
+    const int batch_size = in_dims[kBatchDim];
+    const int num_classes = in_dims[kClassDim];
+    std::vector<float> entities(batch_size);
+    auto blas = math::GetBlas<DeviceContext, float>(context);
+    for (int n = 0; n < batch_size; ++n) {
+      entities[n] = in_data[n * num_classes];
+      for (int c = 1; c < num_classes; ++c) {
+        entities[n] = in_data[n * num_classes + c] > entities[n]
+                          ? in_data[n * num_classes + c]
+                          : entities[n];
+      }
+      for (int c = 0; c < num_classes; ++c) {
+        out_data[n * num_classes + c] =
+            in_data[n * num_classes + c] - entities[n];
+      }
+    }
+
+    blas.VEXP(num_classes * batch_size, out_data, out_data);
+    for (int n = 0; n < batch_size; ++n) {
+      entities[n] = out_data[n * num_classes];
+      for (int c = 1; c < num_classes; ++c) {
+        entities[n] += out_data[n * num_classes + c];
+      }
+      blas.SCAL(num_classes, 1.0f / entities[n], &out_data[n * num_classes]);
+    }
   }
 };
 
diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h
index 2fea8a65bc5141b11549ef400f11b54278be35f9..91829d5761bfdd1f9806af6589a2967fe866fec8 100644
--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -35,7 +35,7 @@ class SoftmaxKernel : public framework::OpKernel<T> {
     Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
     Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
 
-#ifdef ON_INFER
+#ifdef PADDLE_ON_INFERENCE
     math::SoftmaxFunctor<DeviceContext, T, true>()(
         context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
 #else