diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 273925ba55ee4076454cc496501355c1cd643d39..22579891f397afe58d5b4285f0aece944d8b753c 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator {
 
 public:
   virtual void init(const EvaluatorConfig& config) {
-    CHECK(!FLAGS_use_gpu) << "Not supported";
     Evaluator::init(config);
     if (config.chunk_scheme() == "IOB") {
       numTagTypes_ = 2;
@@ -137,6 +136,7 @@ public:
     CHECK_EQ(arguments.size(), (size_t)2);
     IVectorPtr& output = arguments[0].ids;
     IVectorPtr& label = arguments[1].ids;
+    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
     auto sequenceStartPositions =
         arguments[1].sequenceStartPositions->getVector(false);
     CHECK_EQ(output->getSize(), label->getSize());
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 787ce703a08aef602ac9603dbd7d48b807c7c6d5..0ded30eeb44e95b50ff91722ef96a9f24c81c16d 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse(
       para->getMat(PARAMETER_GRADIENT).get());
   std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
 
-  ids.clear();
   for (auto slaveParams : slaveParameters) {
     SparseRowCpuMatrix* mat = dynamic_cast<SparseRowCpuMatrix*>(
         (*slaveParams)[pid]
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 2f9606dc6802656486d3500a25ae7f06442dd9c4..ff251fe89f9f885c361b6c1ae7dde0ae57695e47 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -146,6 +146,12 @@ public:
     }
   }
 
+  void enableBufType(ParameterType type) {
+    if (bufs_[type]) return;
+    bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+    bufs_[type]->zeroMem();
+  }
+
   void enableIntType(ParameterType type, size_t intStoreSize = 0) {
     if (!intBufs_[type]) {
       SetDevice device(deviceId_);
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index 91f7f4d29df938e88a0e8c54b7046194c7adfb35..a26e9239f987f63ecbcf0183582ca64d64b50af6 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -20,6 +20,8 @@ limitations under the License. */
 #include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Thread.h"
 
+P_DECLARE_int32(trainer_count);
+
 namespace paddle {
 
 SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
@@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) {
                                               false /*inPserver*/));
     size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
     optimizers_[pid]->init(numRows, &para->getConfig());
+    if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
+      // For trainer_count=1, the gradient machine is NeuralNetwork, which does
+      // not create parameter buf for PARAMETER_GRADIENT for sparse update in
+      // Parameter::enableType(). But gradient parameter buf is still used
+      // in SgdThreadUpdater. We need to explicitly create it.
+      para->enableBufType(PARAMETER_GRADIENT);
+    }
   }
 }
 
@@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse(
     // From MultiGradientMachine
     SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
         para->getMat(PARAMETER_GRADIENT).get());
-    const std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
+    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
 
     for (auto id : sparseIds) {
       // setup sub bufs
@@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse(
       optimizer->update(vecs, para->getConfig(), id);
       vecs[PARAMETER_GRADIENT]->zeroMem();
     }
+    sparseIds.clear();
   } else if (dynamic_cast<SparseRowCpuMatrix*>(
                  para->getMat(PARAMETER_GRADIENT).get())) {
     // From NeuralNetwork
@@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse(
       optimizer->update(vecs, para->getConfig(), id);
      vecs[PARAMETER_GRADIENT]->zeroMem();
     }
+    // For numThreads > 1, MultiGradientMachine is used, which goes
+    // to the above branch.
+    CHECK_EQ(numThreads, 1);
+    mainMat->clearIndices();
   } else {
     auto & m = *para->getMat(PARAMETER_GRADIENT).get();
     LOG(FATAL) << "Internal error: " << para->getName() << " "
diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index b3f439804686fa2103eda87b96e61f0d279280a1..7fdfa3240c1de71ca8cd4c4b7e772b6767b43672 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int));
 }
 #endif  // PADDLE_USE_GLOG
 
-#ifdef NDEBUG
+#ifndef NDEBUG
 #define DEBUG_LEVEL 5
 #define DBG VLOG(DEBUG_LEVEL)
 #else