diff --git a/doc/source/gserver/activations/index.rst b/doc/source/gserver/activations/index.rst
index ed6200d9a6c12cce0d6edb2a749c91d860d7fc2f..6ceab41ccb3282ebe474bea74873b3d29b16de9e 100644
--- a/doc/source/gserver/activations/index.rst
+++ b/doc/source/gserver/activations/index.rst
@@ -1,5 +1,5 @@
 Activations
 =============
 
-.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
-.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
+.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
+.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
diff --git a/doc/source/gserver/dataprovider/dataproviders.rst b/doc/source/gserver/dataprovider/dataproviders.rst
index 2d2ace177b97a1735314ad58703498354b16dd67..e8aa4bc35634a0c6ede192a15b276564f7a2c13e 100644
--- a/doc/source/gserver/dataprovider/dataproviders.rst
+++ b/doc/source/gserver/dataprovider/dataproviders.rst
@@ -1,14 +1,83 @@
 Data Providers
 ================
 
-Data Provider
+Base DataProvider
+------------------
+.. doxygenclass:: paddle::DataProvider
+    :members:
+
+DataProviderGroup
+-------------------
+.. doxygenclass:: paddle::DataProviderGroup
+    :members:
+
+MultiDataProvider
+-------------------
+.. doxygenclass:: paddle::MultiDataProvider
+    :members:
+
+PyDataProvider
+===================
+
+IFieldScanner
+-------------
+.. doxygenclass:: paddle::IFieldScanner
+    :members:
+
+DenseScanner
+-------------
+.. doxygenclass:: paddle::DenseScanner
+    :members:
+
+IndexScanner
+-------------
+.. doxygenclass:: paddle::IndexScanner
+    :members:
+
+SparseNonValueScanner
+---------------------
+.. doxygenclass:: paddle::SparseNonValueScanner
+    :members:
+
+SparseValueScanner
+------------------
+.. doxygenclass:: paddle::SparseValueScanner
+    :members:
+
+SequenceScanner
+------------------
+.. doxygenclass:: paddle::SequenceScanner
+    :members:
+
+IPyDataProviderCache
+--------------------
+.. doxygenclass:: paddle::IPyDataProviderCache
+    :members:
+
+NoCacheStrategy
 ---------------
-.. doxygenfile:: paddle/gserver/dataproviders/DataProvider.h
-.. doxygenfile:: paddle/gserver/dataproviders/PyDataProvider2.cpp
-.. doxygenfile:: paddle/gserver/dataproviders/DataProviderGroup.h
-.. doxygenfile:: paddle/gserver/dataproviders/MultiDataProvider.h
+.. doxygenclass:: paddle::NoCacheStrategy
+    :members:
 
-Proto Data Provider
+CacheOnePassInMemory
 --------------------
-.. doxygenfile:: paddle/gserver/dataproviders/ProtoDataProvider.h
-.. doxygenfile:: paddle/gserver/dataproviders/ProtoReader.h
+.. doxygenclass:: paddle::CacheOnePassInMemory
+    :members:
+
+PyDataProvider2
+---------------
+.. doxygenclass:: paddle::PyDataProvider2
+    :members:
+
+Proto Data Provider
+===================
+
+ProtoDataProvider
+-----------------
+.. doxygenclass:: paddle::ProtoDataProvider
+    :members:
+
+ProtoSequenceDataProvider
+-------------------------
+.. doxygenclass:: paddle::ProtoSequenceDataProvider
+    :members:
diff --git a/doc/source/gserver/evaluators/evaluators.rst b/doc/source/gserver/evaluators/evaluators.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0c5cc85e7dff31693bdc9d2ee44ef470a0fc5f90
--- /dev/null
+++ b/doc/source/gserver/evaluators/evaluators.rst
@@ -0,0 +1,102 @@
+Base Evaluator
+==============
+
+Evaluator
+---------
+.. doxygenclass:: paddle::Evaluator
+    :members:
+
+
+Utils
+=====
+
+SumEvaluator
+------------
+.. doxygenclass:: paddle::SumEvaluator
+    :members:
+
+ColumnSumEvaluator
+------------------
+.. doxygenclass:: paddle::ColumnSumEvaluator
+    :members:
+
+Classification
+==============
+
+ClassificationErrorEvaluator
+----------------------------
+.. doxygenclass:: paddle::ClassificationErrorEvaluator
+    :members:
+
+SequenceClassificationErrorEvaluator
+------------------------------------
+.. doxygenclass:: paddle::SequenceClassificationErrorEvaluator
+    :members:
+
+AucEvaluator
+-------------
+.. doxygenclass:: paddle::AucEvaluator
+    :members:
+
+PrecisionRecallEvaluator
+------------------------
+.. doxygenclass:: paddle::PrecisionRecallEvaluator
+    :members:
+
+ChunkEvaluator
+--------------
+.. doxygenclass:: paddle::ChunkEvaluator
+    :members:
+
+CTCEvaluator
+------------
+.. doxygenclass:: paddle::CTCErrorEvaluator
+    :members:
+
+
+Rank
+====
+
+PnpairEvaluator
+---------------
+.. doxygenclass:: paddle::PnpairEvaluator
+    :members:
+
+RankAucEvaluator
+----------------
+.. doxygenclass:: paddle::RankAucEvaluator
+    :members:
+
+
+Printer
+=======
+
+ValuePrinter
+-------------
+.. doxygenclass:: paddle::ValuePrinter
+    :members:
+
+GradientPrinter
+---------------
+.. doxygenclass:: paddle::GradientPrinter
+    :members:
+
+MaxIdPrinter
+------------
+.. doxygenclass:: paddle::MaxIdPrinter
+    :members:
+
+MaxFramePrinter
+---------------
+.. doxygenclass:: paddle::MaxFramePrinter
+    :members:
+
+SequenceTextPrinter
+-------------------
+.. doxygenclass:: paddle::SequenceTextPrinter
+    :members:
+
+ClassificationErrorPrinter
+--------------------------
+.. doxygenclass:: paddle::ClassificationErrorPrinter
+    :members:
diff --git a/doc/source/gserver/evaluators/index.rst b/doc/source/gserver/evaluators/index.rst
index d7f622ff826033c3689564d728c272f8d5618273..298de3e1a32d36b9102f5ad64cc1b968f418041b 100644
--- a/doc/source/gserver/evaluators/index.rst
+++ b/doc/source/gserver/evaluators/index.rst
@@ -1,8 +1,7 @@
 Evaluators
-============
-
-.. doxygenfile:: paddle/gserver/evaluators/Evaluator.h
-.. doxygenfile:: paddle/gserver/evaluators/ChunkEvaluator.cpp
-.. doxygenfile:: paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+==========
+.. toctree::
+  :maxdepth: 3
+
+  evaluators.rst
diff --git a/doc/source/gserver/gradientmachines/gradientmachines.rst b/doc/source/gserver/gradientmachines/gradientmachines.rst
index b3009f274e055d9f538cf4a8f51d50069290899d..3607664c850cdf4df4e10151b05f15e275adceaf 100644
--- a/doc/source/gserver/gradientmachines/gradientmachines.rst
+++ b/doc/source/gserver/gradientmachines/gradientmachines.rst
@@ -1,20 +1,40 @@
-Gradient machines
-===================
+Gradient Machines
+=================
 
-Networks
-------------
-.. doxygenfile:: paddle/gserver/gradientmachines/MultiNetwork.h
-.. doxygenfile:: paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+GradientMachine
+---------------------
+.. doxygenclass:: paddle::GradientMachine
+    :members:
 
-Gradient Machines
+IGradientMachineMode
 --------------------
-.. doxygenfile:: paddle/gserver/gradientmachines/GradientMachine.h
-.. doxygenfile:: paddle/gserver/gradientmachines/MultiGradientMachine.h
+.. doxygenclass:: paddle::IGradientMachineMode
+    :members:
+
+MultiGradientMachine
+---------------------
+.. doxygenclass:: paddle::MultiGradientMachine
+    :members:
+
+TrainerThread
+`````````````
+.. doxygenclass:: paddle::TrainerThread
+    :members:
 
 Recurrent Gradient Machines
------------------------------
-.. doxygenfile:: paddle/gserver/gradientmachines/RecurrentGradientMachine.h
-.. doxygenfile:: paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+---------------------------
+.. doxygenclass:: paddle::RecurrentGradientMachine
+    :members:
+Networks
+========
+NeuralNetwork
+-------------
+.. doxygenclass:: paddle::NeuralNetwork
+    :members:
+ParallelNeuralNetwork
+---------------------
+.. doxygenclass:: paddle::ParallelNeuralNetwork
+    :members:
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 3c4bea0b3c3637b1fe3bd1f85e59d4b5d011a824..fb404405fbeb0e2da4785c09e3cce4ef7da71320 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -118,10 +118,10 @@ public:
     data_.push_back(argu);
   }
 
-  /*
-   * argus: DataBatch.getStreams()
-   * size: DataBatch.getSize()
-   * dataId: sub dataprovider id (in MultiDataProvider)
+  /**
+   * @param argus: DataBatch.getStreams()
+   * @param size: DataBatch.getSize()
+   * @param dataId: sub dataprovider id (in MultiDataProvider)
    */
   void appendArguments(const std::vector& argus, int size, int dataId) {
@@ -312,22 +312,28 @@ public:
   }
 };
 
-// Data provider for one input and one integer label
+/**
+ * Data provider for one input and one integer label.
+ */
 class SimpleDataProviderBase : public DataProvider {
 protected:
-  int64_t sampleDim_;       // sample feature dimension
-  int64_t bufferCapacity_;  // the number of samples
+  /// sample feature dimension
+  int64_t sampleDim_;
+  /// the number of samples
+  int64_t bufferCapacity_;
   int64_t sampleNumInBuf_;
-  int64_t nextItemIndex_;  // next item to read in buffer
-  bool withInfo_;          // some user defined info for validation
+  /// next item to read in buffer
+  int64_t nextItemIndex_;
+  /// some user defined info for validation
+  bool withInfo_;
 
-  // data buffer: bufferCapacity_ * nDataDim_
+  /// data buffer: bufferCapacity_ * nDataDim_
   CpuMatrixPtr hInputDataBuf_;
 
-  // label buffer:bufferCapacity_ * 1
+  /// label buffer:bufferCapacity_ * 1
   CpuIVectorPtr hInputLabelBuf_;
 
-  // info buffer:bufferCapacity_ * 1
+  /// info buffer:bufferCapacity_ * 1
   CpuIVectorPtr hInputInfoBuf_;
 
   ThreadLocal dataBatch_;
@@ -348,7 +354,7 @@ public:
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
 
-  // return the number of samples in the buffer
+  /// return the number of samples in the buffer
   int64_t fillBuffer();
 
 protected:
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 0f3f55738a37edaecb9937c2b8c02205887f87e3..02db5f870db74be7237d96d187f134b94192933e 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -80,7 +80,7 @@ protected:
   */
   inline bool iidData() const { return sequenceStartPositions_.empty(); }
 
-  // check that sample is consistent with header_
+  /// check that sample is consistent with header_
   void checkSample(const DataSample& sample);
 
   template
@@ -129,14 +129,15 @@ protected:
   int64_t currentSequenceIndex_;
 
-  // The size should be the number of sequences.
+  /// The size should be the number of sequences.
   std::vector shuffledSequenceIds_;
 
   ThreadLocalD cpuBatch_;
   ThreadLocalD gpuBatch_;
 
   RWLock lock_;
 
-  std::vector nnzStats_;  // stats for number of none-zeros entries
+  // stats for number of non-zero entries
+  std::vector nnzStats_;
 };
 
 /**
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index ba9847f6abcee0b1f20f9895542cff6a1c614aad..a50eecdbb758ada0184a51cdb4546efe2f000d71 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -1000,20 +1000,34 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
 /**
  * Sequence text printer will print text according to index matrix and a
  * dictionary. There can be multiple input to this layer:
+ *
  * 1) If there is only one input, the input must be a matrix containing
  * the sequence of indices;
+ *
  * 2) If there are more than one input, the first input should be ids,
  * and are interpreted as sample ids.
  *
  * The output format will be:
+ *
  * 1) sequence without sub-sequence, and there is probability.
+ *
+ * @code
  * id \t prob space_seperated_tokens_from_dictionary_according_to_seq
+ * @endcode
+ *
  * 2) sequence without sub-sequence, and there is not probability.
+ *
+ * @code
  * id \t space_seperated_tokens_from_dictionary_according_to_seq
+ * @endcode
+ *
 * 3) sequence with sub-sequence, and there is not probability.
+ *
+ * @code
 * id \t space_seperated_tokens_from_dictionary_according_to_sub_seq
 * \t \t space_seperated_tokens_from_dictionary_according_to_sub_seq
 * ...
+ * @endcode
 *
 * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
 * with maxid (when generating) as an input.
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 316219b4fb292b09c5c7272885c64506f32d6ba5..eee785e0e3a092995c9e152ad2dd75027706a6fc 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -68,7 +68,7 @@ public:
     numSamples_ += arguments[0].getBatchSize();
   }
 
-  // finish() should be called before distributeEval
+  /// finish() should be called before distributeEval
   virtual void distributeEval(ParameterClient2* client) {
     LOG(FATAL) << "Not implemeted";
   }
@@ -85,7 +85,7 @@ public:
    */
   virtual void finish() {}
 
-  // finish() should be called before printStats
+  /// finish() should be called before printStats
   virtual void printStats(std::ostream& os) {
     os << config_.name() << "="
        << (numSamples_ ? totalScore_ / numSamples_ : 0);
@@ -130,9 +130,9 @@ public:
   /**
    * @brief evaluate AUC using colIdx-th column as prediction.
    *
-   * colIdx = 0: the 0-th column.
-   * colIdx > 0: the colIdx-th column.
-   * colIdx < 0: the last colIdx-th column.
+   * - colIdx = 0: the 0-th column.
+   * - colIdx > 0: the colIdx-th column.
+   * - colIdx < 0: the last colIdx-th column.
   *
   */
  AucEvaluator(int32_t colIdx)
@@ -223,10 +223,14 @@ public:
   virtual void distributeEval(ParameterClient2* client);
 
   struct StatsInfo {
-    double TP;  // numbers of true positives
-    double TN;  // numbers of true negatives
-    double FP;  // numbers of false positives
-    double FN;  // numbers of false negatives
+    /// numbers of true positives
+    double TP;
+    /// numbers of true negatives
+    double TN;
+    /// numbers of false positives
+    double FP;
+    /// numbers of false negatives
+    double FN;
 
     StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
   };
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 7233f985c56e96e0d19009ac0f0f47a41e88a5bb..986a1ee71dbb00781c6af93a06f3e16d6639c307 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -247,7 +247,7 @@ public:
   virtual void restart() {}
 
-  // Set the gradient of the output from outside.
+  /// Set the gradient of the output from outside.
   virtual void setOutputGrad(const std::vector& args) {
     LOG(FATAL) << "Not implemented!";
   }
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 7c4ec4f6d2563ffc383f179c5d756dcc78fbc3bc..d13cf426c29e4e9f6806178f2362e8189fdb0dec 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -31,14 +31,15 @@
 typedef Queue PidQueue;
 typedef std::unique_ptr TrainerThreadPtr;
 
 struct GradBuffer {
-  // GradBuffer is used for gathering gradient for GPU parameters
+  /// GradBuffer is used for gathering gradient for GPU parameters
   int paramId;
 
-  // sem is used to notify that the local gradient merge of the current thread
-  // finished for the current thread.
+  /// sem is used to notify that the local gradient merge of the current thread
+  /// finished for the current thread.
   Semaphore sem;
 
-  std::vector bufs;  // bufs[mergeIndex]
+  /// bufs[mergeIndex]
+  std::vector bufs;
 };
 
 /**
@@ -189,14 +190,14 @@ public:
     return useGpu_;
   }
 
-  // @return whether to pass the gradients in outArgs_ to each threads.
+  /// @return whether to pass the gradients in outArgs_ to each thread.
   bool isPassGrad() { return isPassGrad_; }
 
-  // @brief set whether to pass the gradient in outArgs_ to each threads.
+  /// @brief set whether to pass the gradient in outArgs_ to each thread.
   void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
 
-  // Set the gradients of the outputs.
-  // The gradietns will be copied to each thread in the computing threads.
+  /// Set the gradients of the outputs.
+  /// The gradients will be copied to each thread in the computing threads.
   virtual void setOutputGrad(const std::vector& args);
 
 protected:
@@ -205,8 +206,8 @@ protected:
 
   std::vector& getAllThreads() { return threads_; }
 
-  // Calculate the real device id based on the logical device id and the
-  // thread id.
+  /// Calculate the real device id based on the logical device id and the
+  /// thread id.
   int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
     if (logicalId == -1) {
       logicalId = 0;
@@ -215,8 +216,8 @@ protected:
                numDevices_);
   }
 
-  // Calculate the logical device id based on the real device id and the
-  // thread id.
+  /// Calculate the logical device id based on the real device id and the
+  /// thread id.
   int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
     if (realId == -1) {
       return 0;
@@ -232,15 +233,15 @@ protected:
     return hasNonstaticCpuParamters_;
   }
 
-  // Called TrainerThread to wait before merging CPU parameter gradients.
+  /// Called by TrainerThread to wait before merging CPU parameter gradients.
   void waitBeforeMerge() { trainerBarrier_.wait(); }
 
-  // called by MultiGradientMachine and TrainerThread to wait after merging
-  // CPU parameter graidents.
+  /// called by MultiGradientMachine and TrainerThread to wait after merging
+  /// CPU parameter gradients.
   void waitAfterMerge() { allBarrier_.wait(); }
 
-  // called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
-  // finishing
+  /// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
+  /// finishing
   void waitForCopyInArgs() { allBarrier_.wait(); }
 
   TrainerThreadPtr& getThread(int threadId) {
@@ -255,8 +256,8 @@ protected:
     return passType_;
   }
 
-  // Called by TrainerThread to notify MultiGradientMachine that the gradient
-  // for paramId is ready
+  /// Called by TrainerThread to notify MultiGradientMachine that the gradient
+  /// for paramId is ready
   void notifyGradientTransfer(int paramId);
 
   const std::vector& getInArgs() {
@@ -297,7 +298,7 @@ protected:
   virtual void backwardImp(
     const UpdateCallback& callback = NULL);
 
-  // update all parameters
+  /// update all parameters
   void updateThreadParameters();
 
   void startTask(TaskType taskType);
@@ -311,7 +312,7 @@ protected:
 
   bool hasNonstaticCpuParamters_;
 
-  // store main parameter only
+  /// store main parameter only
   std::unique_ptr gradientMachine_;
 
   std::vector threads_;
@@ -326,7 +327,7 @@ protected:
   std::vector outArgs_;
   hl_stream_t outArgStream_;
 
-  // ParameterType which needs to be merged from each GPU
+  /// ParameterType which needs to be merged from each GPU
   std::vector mergeTypes_;
   int numDevices_;         /* number of gpu devices */
   int numLogicalDevices_;  // number of GPU used by one NN
@@ -334,16 +335,16 @@ protected:
 
   UpdateCallback backwardCallback_;
 
-  // barrrier for threads_
+  /// barrier for threads_
   ThreadBarrier trainerBarrier_;
 
-  // barrier for both MultiGradientMachine and threds_
+  /// barrier for both MultiGradientMachine and threads_
   ThreadBarrier allBarrier_;
 
-  // indicate whether inArgs is copied before forward()
+  /// indicate whether inArgs is copied before forward()
   bool inArgsCopied_;
 
-  // Whether to copy the gradient back from an external input.
+  /// Whether to copy the gradient back from an external input.
   bool isPassGrad_;
 };
 
@@ -413,7 +414,7 @@ public:
 
   void prefetch();
 
-  // copy the output gradient from the main GradientMachine.
+  /// copy the output gradient from the main GradientMachine.
   void copyOutputGrad();
 
 protected:
@@ -441,51 +442,60 @@ protected:
   void backward();
   void backwardCallback(Parameter* para);
 
-  // call the actuall callback supplied by the caller of
-  // GradientMachine::backward
+  /// call the actual callback supplied by the caller of
+  /// GradientMachine::backward
   void doCallback(int pid);
 
 protected:
   MultiGradientMachine* multiMachine_;
   ModelConfig config_;
 
-  bool stopping_;  // whether the thread should stop
-  int partnerId_;  // the threads form which to collect gradient
-  int threadId_;   // from 0 to #threads-1
+  /// whether the thread should stop
+  bool stopping_;
+  /// the thread from which to collect gradient
+  int partnerId_;
+  /// from 0 to threads-1
+  int threadId_;
   int deviceId_;
   std::unique_ptr gradientMachine_;
   std::vector parameters_;
 
-  // ParameterType which needs to be merged from each GPU
+  /// ParameterType which needs to be merged from each GPU
   std::vector mergeTypes_;
 
-  std::unique_ptr computeThread_;  // compute thread
+  /// compute thread
+  std::unique_ptr computeThread_;
   std::vector inArgs_;
   std::vector outArgs_;
   Semaphore taskReadySem_;
   Semaphore outArgsReadySem_;
 
-  std::unique_ptr copyThread_;  // copy thread
-  PidQueue gradBufQueue_;  // queue of gradient needs to be copied to partner
+  /// copy thread
+  std::unique_ptr copyThread_;
+  /// queue of gradients that need to be copied to the partner
+  PidQueue gradBufQueue_;
   hl_stream_t gradStream_;
 
-  std::unique_ptr gradCollectThread_;  // grad merge thread
-  // queue of gradient needs to be merged with gradient coopied by
-  // copyGradToBufferThread
+  /// grad merge thread
+  std::unique_ptr gradCollectThread_;
+  /// queue of gradients that need to be merged with the gradient copied by
+  /// copyGradToBufferThread
   PidQueue gradQueue_;
   UpdateCallback backwardCallback_;
 
-  std::unique_ptr valueDispatchThread_;  // value dispatch thread
-  // queue of the parameter whose the vale are ready for copy
+  /// value dispatch thread
+  std::unique_ptr valueDispatchThread_;
+  /// queue of the parameters whose values are ready for copy
   PidQueue valueReadyQueue_;
 
-  // used to notify all the parameter values are ready
+  /// used to notify all the parameter values are ready
   LockedCondition valueReadyCond_;
 
   hl_stream_t valueStream_;
-  std::atomic updateCounter_;  // how many parameters are updated
+  /// how many parameters are updated
+  std::atomic updateCounter_;
   bool parameterUpdated_;
 
-  // indicate whether inArgs is copied before forward()
+  /// indicate whether inArgs is copied before forward()
   bool inArgsCopied_;
 };
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 1b440042d726406cd970a8cf55b81f3feae25b8a..06c679a63cc79b68b9fd27dfb64dfa9add8a1078 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -66,12 +66,15 @@ public:
                      PARAMETER_MOMENTUM},
       bool useGpu = FLAGS_use_gpu);
 
-  // connect two submodels
-  // down-submodel's output become up-submodel's input
-  // *realLayer* is down-submodel's output layer
-  // *agentLayer* is up-submodel's input agent layer
-  // by default, connection is one by one,
-  // if the agent height is smaller than real layer, *height* has to be filled
+  /**
+   * Connect two submodels:
+   * the down-submodel's output becomes the up-submodel's input.
+   * By default, connection is one by one;
+   * if the agent height is smaller than the real layer, *height* has to be filled.
+   *
+   * @param realLayer The down-submodel's output layer.
+   * @param agentLayer The up-submodel's input agent layer.
+   */
   static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
   void connect(std::string agentLayerName, NeuralNetwork* srcNN,
                std::string realLayerName);
@@ -98,10 +101,10 @@ public:
   virtual void resetState();
   virtual void setOutputGrad(const std::vector& args);
 
-  // set machine state
+  /// set machine state
   virtual void setState(const MachineState& machineState);
 
-  // get machine state
+  /// get machine state
   virtual void getState(MachineState& machineState);
 
   static NeuralNetwork* create(const ModelConfig& config);
@@ -126,8 +129,14 @@ public:
                       NeuralNetwork* rootNetwork = nullptr);
 
 protected:
-  // rootNetwork: used in MultiNetwork
-  // sub networks can get parameters_ and parameterMap_ from base NeuralNetwork
+  /**
+   * The constructor of NeuralNetwork.
+   * The sub networks can get parameters_ and parameterMap_
+   * from base NeuralNetwork.
+   *
+   * @param subModelName The name of the sub-model.
+   * @param rootNetwork It is used in MultiNetwork.
+   */
   NeuralNetwork(std::string subModelName = "",
                 NeuralNetwork* rootNetwork = nullptr)
       : subModelName_(subModelName),
@@ -146,8 +155,8 @@ protected:
 
   NeuralNetwork* rootNetwork_;
 
-  // Whether parameter of this NN is initialized by its own
-  // (i.e., not by callback supplied with the caller)
+  /// Whether the parameters of this NN are initialized by the NN itself
+  /// (i.e., not by the callback supplied by the caller)
   bool paramSelfInited_;
 };
 
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index eaf2376e132b279153ee6c04588848c86fcc23ec..2a3db654f4e16c0ecd4be91425330208046b4a6c 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -67,7 +67,8 @@ public:
 
 protected:
   bool useGpu_;
-  int numDevices_; /* number of gpu devices */
+  /// number of gpu devices
+  int numDevices_;
 
   std::vector> threads_;
 };
@@ -97,11 +98,14 @@ public:
   JobQueue queue_;
 
 protected:
-  int threadId_;  // from 0 to #threads-1
-  int deviceId_;  // the GPU device Id which the computeThread_ used
+  /// from 0 to threads-1
+  int threadId_;
+  /// the GPU device Id which the computeThread_ used
+  int deviceId_;
   bool useGpu_;
   std::unique_ptr computeThread_;
-  bool stopping_;  // whether the thread should stop
+  /// whether the thread should stop
+  bool stopping_;
   UpdateCallback backwardCallback_;
   PassType passType_;
 };
 
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index b4383370a06e3f1d94f3d8bd645147baab1bc29d..b464e16737ae561dce6e7d4f16a4dd61f73204e0 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -53,8 +53,13 @@ protected:
   real coeff_;
 };
 
-/*
- * MultiClassCrossEntropy
+/**
+ * The cross-entropy loss for the multi-class classification task.
+ * The loss function is:
+ *
+ * \f[
+ * L = - \sum_{k}{t_{k} * log(P(y=k))}
+ * \f]
  */
 class MultiClassCrossEntropy : public CostLayer {
 public:
@@ -68,9 +73,20 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
 
-/*
- * MultiClassCrossEntropyWithSelfNorm
- * \sum_i (-log(x_label(i)) + alpha * log(Z(i)^2)
+/**
+ * The cross-entropy with self-normalization for multi-class classification.
+ *
+ * The loss function is:
+ * \f[
+ * L = \sum_{i}[-log(P(x_{i})) + alpha * log(Z(x_{i})^2)]
+ * \f]
+ *
+ * \f$Z(x)\f$ is the softmax normalizer.
+ *
+ * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
+ * Richard Schwartz, and John Makhoul. Fast and robust neural
+ * network joint models for statistical machine translation.
+ * In Proceedings of the ACL 2014 Conference.
  */
 class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
 public:
@@ -88,9 +104,11 @@ protected:
   MatrixPtr sumInv_;
 };
 
-/*
- * SoftBinaryClassCrossEntropy
- * \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
+/**
+ * The cross-entropy for soft binary class.
+ * \f[
+ * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
+ * \f]
  */
 class SoftBinaryClassCrossEntropy : public CostLayer {
 public:
@@ -107,6 +125,13 @@ protected:
   MatrixPtr targetPerDim_;
 };
 
+/**
+ * This cost layer computes the Euclidean (L2) loss for real-valued regression
+ * tasks.
+ * \f[
+ * L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
+ * \f]
+ */
 class SumOfSquaresCostLayer : public CostLayer {
 public:
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
@@ -119,8 +144,17 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
 
-/*
- * RankingCost
+/**
+ * A cost layer for the learning to rank (LTR) task. This layer contains at
+ * least three inputs.
+ * \f[
+ * C_{i,j} = -\tilde{P_{i,j}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
+ * o_{i,j} = o_i - o_j \\
+ * \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
+ * \f]
+ *
+ * [1] Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
+ * Rank using Gradient Descent.
  */
 class RankingCost : public Layer {
 public:
@@ -155,12 +189,25 @@ private:
   double negPairCount_;
   MatrixPtr margin_;
   MatrixPtr marginGrad_;
-  // if input label is put in ids (not value), copy to this buffer.
+  /// if input label is put in ids (not value), copy to this buffer.
   MatrixPtr labelBuf_;
   LayerPtr weightLayer_;
 };
 
-/* lambdaRank listwise LTR approach */
+/**
+ * LambdaRank is a method for learning arbitrary information retrieval
+ * measures. It can be applied to any algorithm that learns through gradient
+ * descent. LambdaRank is a listwise method, in that the cost depends on the
+ * sorted order of the documents. LambdaRank gives the gradient of the cost
+ * function:
+ *
+ * \f[
+ * \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
+ * \f]
+ *
+ * [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
+ * with Nonsmooth Cost Functions.
+ */
 class LambdaCost : public Layer {
 public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
@@ -191,9 +238,11 @@ private:
 };
 
 /**
- * Cross entropy for multi binary labels
- * cost[i] = -sum(label[i][j]*log(output[i][j])
- *           + (1-label[i][j])*log(1-output[i][j]))
+ * Cross entropy for multi binary labels.
+ * \f[
+ * cost[i] = -sum(label[i][j]*log(output[i][j]) +
+ *           (1-label[i][j])*log(1-output[i][j]))
+ * \f]
  */
 class MultiBinaryLabelCrossEntropy : public CostLayer {
 protected:
@@ -210,13 +259,18 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
 
-/*
- * Huber loss for robust 2-classes classification
+/**
+ * Huber loss for robust 2-classes classification.
  *
  * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
- * -4*y*f, if y*f < -1
- * (1-y*f)^2, if -1 < y*f < 1,
- * 0, otherwise
+ * \f[
+ * Loss =
+ * \left\{\begin{matrix}
+ * -4 * y * f & \textit{if} \ \ y * f < -1 \\
+ * (1 - y * f)^2 & \textit{if} \ \ -1 < y * f < 1 \\
+ * 0 & \textit{otherwise}
+ * \end{matrix}\right.
+ * \f]
  */
 class HuberTwoClass : public CostLayer {
   std::vector tmpCpuInput_;
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 931a373fcbcb384561bb5327d120175f5e7e1194..85625c2f6a2cb502d133dc45c9480a614d934cf7 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3083,7 +3083,7 @@ following are cost Layers.
 @wrap_name_default()
 def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
     """
-    A cost Layer for leanrning to rank using gradient descent. Details can refer
+    A cost Layer for learning to rank using gradient descent. For details, refer
     to `papers `_. This layer contains at least three inputs.
     The weight is an optional argument, which affects the cost.
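
For readers wondering how the RankingCost layer documented above is driven from a trainer configuration, a minimal, hypothetical sketch follows. It is not part of this patch; the layer names and sizes are made up, and it assumes the ``data_layer``, ``fc_layer`` and ``outputs`` helpers plus ``SigmoidActivation`` exported by ``paddle.trainer_config_helpers``::

    from paddle.trainer_config_helpers import *

    # Two documents to be compared, plus a 0/1 relevance label.
    left_input = data_layer(name="left_features", size=100)
    right_input = data_layer(name="right_features", size=100)
    label = data_layer(name="label", size=1)

    # Each side is reduced to a one-dimensional score; in practice the same
    # scoring sub-network would usually be shared between the two inputs.
    left_score = fc_layer(input=left_input, size=1, act=SigmoidActivation())
    right_score = fc_layer(input=right_input, size=1, act=SigmoidActivation())

    # rank_cost wires up the pairwise RankingCost described in CostLayer.h.
    cost = rank_cost(left_score, right_score, label)
    outputs(cost)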