Commit f0637523 authored by D dangqingqing

Adjustment doc and code for CostLayer, GradientMachine and DataProvider.

Also add some comments for cost layers.
ISSUE=4580653

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1410 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent 4268885c
Activations
===========
.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
Data Providers
==============
Base DataProvider
------------------
.. doxygenclass:: paddle::DataProvider
:members:
DataProviderGroup
-------------------
.. doxygenclass:: paddle::DataProviderGroup
:members:
MultiDataProvider
-------------------
.. doxygenclass:: paddle::MultiDataProvider
:members:
PyDataProvider
===================
IFieldScanner
-------------
.. doxygenclass:: paddle::IFieldScanner
:members:
DenseScanner
-------------
.. doxygenclass:: paddle::DenseScanner
:members:
IndexScanner
-------------
.. doxygenclass:: paddle::IndexScanner
:members:
SparseNonValueScanner
---------------------
.. doxygenclass:: paddle::SparseNonValueScanner
:members:
SparseValueScanner
------------------
.. doxygenclass:: paddle::SparseValueScanner
:members:
SequenceScanner
------------------
.. doxygenclass:: paddle::SequenceScanner
:members:
IPyDataProviderCache
--------------------
.. doxygenclass:: paddle::IPyDataProviderCache
:members:
NoCacheStrategy
---------------
.. doxygenclass:: paddle::NoCacheStrategy
:members:
CacheOnePassInMemory
--------------------
.. doxygenclass:: paddle::CacheOnePassInMemory
:members:
PyDataProvider2
---------------
.. doxygenclass:: paddle::PyDataProvider2
:members:
Proto Data Provider
===================
ProtoDataProvider
-----------------
.. doxygenclass:: paddle::ProtoDataProvider
:members:
ProtoSequenceDataProvider
-------------------------
.. doxygenclass:: paddle::ProtoSequenceDataProvider
:members:
Base Evaluator
==============
Evaluator
---------
.. doxygenclass:: paddle::Evaluator
:members:
Utils
=====
SumEvaluator
------------
.. doxygenclass:: paddle::SumEvaluator
:members:
ColumnSumEvaluator
------------------
.. doxygenclass:: paddle::ColumnSumEvaluator
:members:
Classification
==============
ClassificationErrorEvaluator
----------------------------
.. doxygenclass:: paddle::ClassificationErrorEvaluator
:members:
SequenceClassificationErrorEvaluator
------------------------------------
.. doxygenclass:: paddle::SequenceClassificationErrorEvaluator
:members:
AucEvaluator
-------------
.. doxygenclass:: paddle::AucEvaluator
:members:
PrecisionRecallEvaluator
------------------------
.. doxygenclass:: paddle::PrecisionRecallEvaluator
:members:
ChunkEvaluator
--------------
.. doxygenclass:: paddle::ChunkEvaluator
:members:
CTCEvaluator
------------
.. doxygenclass:: paddle::CTCErrorEvaluator
:members:
Rank
====
PnpairEvaluator
---------------
.. doxygenclass:: paddle::PnpairEvaluator
:members:
RankAucEvaluator
----------------
.. doxygenclass:: paddle::RankAucEvaluator
:members:
Printer
=======
ValuePrinter
-------------
.. doxygenclass:: paddle::ValuePrinter
:members:
GradientPrinter
---------------
.. doxygenclass:: paddle::GradientPrinter
:members:
MaxIdPrinter
------------
.. doxygenclass:: paddle::MaxIdPrinter
:members:
MaxFramePrinter
---------------
.. doxygenclass:: paddle::MaxFramePrinter
:members:
SequenceTextPrinter
-------------------
.. doxygenclass:: paddle::SequenceTextPrinter
:members:
ClassificationErrorPrinter
--------------------------
.. doxygenclass:: paddle::ClassificationErrorPrinter
:members:
Evaluators
==========
.. toctree::
:maxdepth: 3
evaluators.rst
Gradient Machines
=================
GradientMachine
---------------
.. doxygenclass:: paddle::GradientMachine
:members:
GradientMachineModel
--------------------
.. doxygenclass:: paddle::IGradientMachineMode
:members:
MultiGradientMachine
---------------------
.. doxygenclass:: paddle::MultiGradientMachine
:members:
TrainerThread
`````````````
.. doxygenclass:: paddle::TrainerThread
:members:
Recurrent Gradient Machines
---------------------------
.. doxygenclass:: paddle::RecurrentGradientMachine
:members:
Networks
========
NeuralNetwork
-------------
.. doxygenclass:: paddle::NeuralNetwork
:members:
ParallelNeuralNetwork
---------------------
.. doxygenclass:: paddle::ParallelNeuralNetwork
:members:
@@ -118,10 +118,10 @@ public:
data_.push_back(argu);
}
/**
 * @param argus: DataBatch.getStreams()
 * @param size: DataBatch.getSize()
 * @param dataId: sub dataprovider id (in MultiDataProvider)
 */
void appendArguments(const std::vector<Argument>& argus, int size,
int dataId) {
@@ -312,22 +312,28 @@ public:
}
};
/**
 * Data provider for one input and one integer label.
 */
class SimpleDataProviderBase : public DataProvider {
protected:
/// sample feature dimension
int64_t sampleDim_;
/// the number of samples
int64_t bufferCapacity_;
int64_t sampleNumInBuf_;
/// next item to read in buffer
int64_t nextItemIndex_;
/// some user defined info for validation
bool withInfo_;
/// data buffer: bufferCapacity_ * nDataDim_
CpuMatrixPtr hInputDataBuf_;
/// label buffer: bufferCapacity_ * 1
CpuIVectorPtr hInputLabelBuf_;
/// info buffer: bufferCapacity_ * 1
CpuIVectorPtr hInputInfoBuf_;
ThreadLocal<MatrixPtr> dataBatch_;
@@ -348,7 +354,7 @@ public:
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
/// return the number of samples in the buffer
int64_t fillBuffer();
protected:
......
@@ -80,7 +80,7 @@ protected:
 */
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/// check that sample is consistent with header_
void checkSample(const DataSample& sample);
template <class Op>
@@ -129,14 +129,15 @@ protected:
int64_t currentSequenceIndex_;
/// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
// stats for number of non-zero entries
std::vector<StatPtr> nnzStats_;
};
/**
......
@@ -1000,20 +1000,34 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
/**
 * Sequence text printer will print text according to index matrix and a
 * dictionary. There can be multiple inputs to this layer:
 *
 * 1) If there is only one input, the input must be a matrix containing
 * the sequence of indices;
 *
 * 2) If there is more than one input, the first input should be ids,
 * which are interpreted as sample ids.
 *
 * The output format will be:
 *
 * 1) sequence without sub-sequence, and there is probability.
 *
 * @code
 * id \t prob space_separated_tokens_from_dictionary_according_to_seq
 * @endcode
 *
 * 2) sequence without sub-sequence, and there is no probability.
 *
 * @code
 * id \t space_separated_tokens_from_dictionary_according_to_seq
 * @endcode
 *
 * 3) sequence with sub-sequence, and there is no probability.
 *
 * @code
 * id \t space_separated_tokens_from_dictionary_according_to_sub_seq
 * \t \t space_separated_tokens_from_dictionary_according_to_sub_seq
 * ...
 * @endcode
 *
 * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
 * with maxid (when generating) as an input.
......
@@ -68,7 +68,7 @@ public:
numSamples_ += arguments[0].getBatchSize();
}
/// finish() should be called before distributeEval
virtual void distributeEval(ParameterClient2* client) {
LOG(FATAL) << "Not implemented";
}
@@ -85,7 +85,7 @@ public:
 */
virtual void finish() {}
/// finish() should be called before printStats
virtual void printStats(std::ostream& os) {
os << config_.name() << "="
<< (numSamples_ ? totalScore_ / numSamples_ : 0);
@@ -130,9 +130,9 @@ public:
/**
 * @brief evaluate AUC using colIdx-th column as prediction.
 *
 * - colIdx = 0: the 0-th column.
 * - colIdx > 0: the colIdx-th column.
 * - colIdx < 0: the |colIdx|-th column from the end.
 *
 */
AucEvaluator(int32_t colIdx)
@@ -223,10 +223,14 @@ public:
virtual void distributeEval(ParameterClient2* client);
struct StatsInfo {
/// number of true positives
double TP;
/// number of true negatives
double TN;
/// number of false positives
double FP;
/// number of false negatives
double FN;
StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
};
......
@@ -247,7 +247,7 @@ public:
virtual void restart() {}
/// Set the gradient of the output from outside.
virtual void setOutputGrad(const std::vector<Argument>& args) {
LOG(FATAL) << "Not implemented!";
}
......
@@ -31,14 +31,15 @@ typedef Queue<int> PidQueue;
typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
struct GradBuffer {
/// GradBuffer is used for gathering gradients for GPU parameters
int paramId;
/// sem is used to notify that the local gradient merge of the current thread
/// has finished.
Semaphore sem;
// bufs[mergeIndex]
std::vector<VectorPtr> bufs;
};
/** /**
@@ -189,14 +190,14 @@ public:
return useGpu_;
}
/// @return whether to pass the gradients in outArgs_ to each thread.
bool isPassGrad() { return isPassGrad_; }
/// @brief set whether to pass the gradient in outArgs_ to each thread.
void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
/// Set the gradients of the outputs.
/// The gradients will be copied to each thread in the computing threads.
virtual void setOutputGrad(const std::vector<Argument>& args);
protected:
@@ -205,8 +206,8 @@ protected:
std::vector<TrainerThreadPtr>& getAllThreads() {
return threads_;
}
/// Calculate the real device id based on the logical device id and the
/// thread id.
int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
if (logicalId == -1) {
logicalId = 0;
@@ -215,8 +216,8 @@ protected:
numDevices_);
}
/// Calculate the logical device id based on the real device id and the
/// thread id.
int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
if (realId == -1) {
return 0;
@@ -232,15 +233,15 @@ protected:
return hasNonstaticCpuParamters_;
}
/// Called by TrainerThread to wait before merging CPU parameter gradients.
void waitBeforeMerge() { trainerBarrier_.wait(); }
/// called by MultiGradientMachine and TrainerThread to wait after merging
/// CPU parameter gradients.
void waitAfterMerge() { allBarrier_.wait(); }
/// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
/// to finish
void waitForCopyInArgs() { allBarrier_.wait(); }
TrainerThreadPtr& getThread(int threadId) {
@@ -255,8 +256,8 @@ protected:
return passType_;
}
/// Called by TrainerThread to notify MultiGradientMachine that the gradient
/// for paramId is ready
void notifyGradientTransfer(int paramId);
const std::vector<Argument>& getInArgs() {
@@ -297,7 +298,7 @@ protected:
virtual void backwardImp(
const UpdateCallback& callback = NULL);
/// update all parameters
void updateThreadParameters();
void startTask(TaskType taskType);
@@ -311,7 +312,7 @@ protected:
bool hasNonstaticCpuParamters_;
/// store main parameter only
std::unique_ptr<GradientMachine> gradientMachine_;
std::vector<TrainerThreadPtr> threads_;
@@ -326,7 +327,7 @@ protected:
std::vector<Argument> outArgs_;
hl_stream_t outArgStream_;
/// ParameterType which needs to be merged from each GPU
std::vector<ParameterType> mergeTypes_;
int numDevices_; /* number of gpu devices */
int numLogicalDevices_; // number of GPU used by one NN
@@ -334,16 +335,16 @@ protected:
UpdateCallback backwardCallback_;
/// barrier for threads_
ThreadBarrier trainerBarrier_;
/// barrier for both MultiGradientMachine and threads_
ThreadBarrier allBarrier_;
/// indicate whether inArgs is copied before forward()
bool inArgsCopied_;
/// Whether to copy the gradient back from an external input.
bool isPassGrad_;
};
@@ -413,7 +414,7 @@ public:
void prefetch();
/// copy the output gradient from the main GradientMachine.
void copyOutputGrad();
protected:
@@ -441,51 +442,60 @@ protected:
void backward();
void backwardCallback(Parameter* para);
/// call the actual callback supplied by the caller of
/// GradientMachine::backward
void doCallback(int pid);
protected:
MultiGradientMachine* multiMachine_;
ModelConfig config_;
/// whether the thread should stop
bool stopping_;
/// the thread from which to collect gradient
int partnerId_;
/// from 0 to the number of threads - 1
int threadId_;
int deviceId_;
std::unique_ptr<GradientMachine> gradientMachine_;
std::vector<ParameterPtr> parameters_;
/// ParameterType which needs to be merged from each GPU
std::vector<ParameterType> mergeTypes_;
/// compute thread
std::unique_ptr<std::thread> computeThread_;
std::vector<Argument> inArgs_;
std::vector<Argument> outArgs_;
Semaphore taskReadySem_;
Semaphore outArgsReadySem_;
/// copy thread
std::unique_ptr<std::thread> copyThread_;
/// queue of gradients that need to be copied to the partner
PidQueue gradBufQueue_;
hl_stream_t gradStream_;
/// grad merge thread
std::unique_ptr<std::thread> gradCollectThread_;
/// queue of gradients that need to be merged with gradients copied by
/// copyGradToBufferThread
PidQueue gradQueue_;
UpdateCallback backwardCallback_;
/// value dispatch thread
std::unique_ptr<std::thread> valueDispatchThread_;
/// queue of the parameters whose values are ready for copy
PidQueue valueReadyQueue_;
/// used to notify that all the parameter values are ready
LockedCondition valueReadyCond_;
hl_stream_t valueStream_;
/// how many parameters are updated
std::atomic<int> updateCounter_;
bool parameterUpdated_;
/// indicate whether inArgs is copied before forward()
bool inArgsCopied_;
};
......
@@ -66,12 +66,15 @@ public:
PARAMETER_MOMENTUM},
bool useGpu = FLAGS_use_gpu);
/**
 * Connect two submodels:
 * the down-submodel's output becomes the up-submodel's input.
 * By default, the connection is one by one;
 * if the agent height is smaller than the real layer, *height* has to be filled.
 *
 * @param realLayer The down-submodel's output layer.
 * @param agentLayer The up-submodel's input agent layer.
 */
static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
void connect(std::string agentLayerName, NeuralNetwork* srcNN,
std::string realLayerName);
@@ -98,10 +101,10 @@ public:
virtual void resetState();
virtual void setOutputGrad(const std::vector<Argument>& args);
/// set machine state
virtual void setState(const MachineState& machineState);
/// get machine state
virtual void getState(MachineState& machineState);
static NeuralNetwork* create(const ModelConfig& config);
@@ -126,8 +129,14 @@ public:
NeuralNetwork* rootNetwork = nullptr);
protected:
/**
 * The constructor of NeuralNetwork.
 * Sub networks can get parameters_ and parameterMap_
 * from the base NeuralNetwork.
 *
 * @param subModelName The name of the sub-model.
 * @param rootNetwork It is used in MultiNetwork.
 */
NeuralNetwork(std::string subModelName = "",
NeuralNetwork* rootNetwork = nullptr)
: subModelName_(subModelName),
@@ -146,8 +155,8 @@ protected:
NeuralNetwork* rootNetwork_;
/// Whether the parameters of this NN are initialized by itself
/// (i.e., not by the callback supplied by the caller)
bool paramSelfInited_;
};
......
@@ -67,7 +67,8 @@ public:
protected:
bool useGpu_;
/// number of gpu devices
int numDevices_;
std::vector<std::unique_ptr<ParallelThread>> threads_;
};
@@ -97,11 +98,14 @@ public:
JobQueue queue_;
protected:
/// from 0 to the number of threads - 1
int threadId_;
/// the GPU device id used by computeThread_
int deviceId_;
bool useGpu_;
std::unique_ptr<std::thread> computeThread_;
/// whether the thread should stop
bool stopping_;
UpdateCallback backwardCallback_;
PassType passType_;
};
......
@@ -53,8 +53,13 @@ protected:
real coeff_;
};
/**
 * The cross-entropy loss for multi-class classification tasks.
 * The loss function is:
 *
 * \f[
 * L = - \sum_{k}{t_{k} * log(P(y=k))}
 * \f]
 */
class MultiClassCrossEntropy : public CostLayer {
public:
@@ -68,9 +73,20 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
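As a quick sanity check of the formula above, here is a small standalone C++ snippet (independent of Paddle's Matrix/Argument types; the helper name and toy values are illustrative only) that evaluates the per-sample cross-entropy for softmax outputs and integer labels:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Per-sample loss L = -sum_k t_k * log(P(y = k)); with a one-hot target this
// reduces to -log of the probability assigned to the true class.
double crossEntropy(const std::vector<double>& prob, int label) {
  return -std::log(prob[label]);
}

int main() {
  // Softmax outputs for two toy samples over three classes.
  std::vector<std::vector<double>> prob = {{0.7, 0.2, 0.1}, {0.1, 0.1, 0.8}};
  std::vector<int> label = {0, 2};
  for (size_t i = 0; i < prob.size(); ++i) {
    std::printf("sample %zu: loss = %f\n", i, crossEntropy(prob[i], label[i]));
  }
  return 0;  // expected: 0.356675 and 0.223144
}
```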
/**
 * The cross-entropy with self-normalization for multi-class classification.
 *
 * The loss function is:
 * \f[
 * L = \sum_{i}[-log(P(x_{i})) + \alpha * log(Z(x_{i})^2)]
 * \f]
 *
 * \f$Z(x)\f$ is the softmax normalizer.
 *
 * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
 * Richard Schwartz, and John Makhoul. Fast and robust neural
 * network joint models for statistical machine translation.
 * In Proceedings of the ACL 2014 Conference.
 */
class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
public:
@@ -88,9 +104,11 @@ protected:
MatrixPtr sumInv_;
};
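To make the self-normalization term concrete, the standalone sketch below evaluates the per-sample loss from unnormalized scores; Z is the softmax partition function and alpha weights the log(Z)^2 penalty. The names and values are assumptions for illustration, not Paddle's API:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Per-sample loss: -log(P(label)) + alpha * log(Z)^2, where Z = sum_k exp(s_k).
// The penalty pushes log(Z) towards 0, so raw scores can be used as
// approximate log-probabilities without normalization at decode time.
double selfNormalizedCrossEntropy(const std::vector<double>& score, int label,
                                  double alpha) {
  double z = 0.0;
  for (double s : score) z += std::exp(s);
  double logProb = score[label] - std::log(z);
  return -logProb + alpha * std::log(z) * std::log(z);
}

int main() {
  std::vector<double> score = {2.0, 0.5, -1.0};  // unnormalized class scores
  std::printf("loss = %f\n", selfNormalizedCrossEntropy(score, 0, 0.1));
  return 0;
}
```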
/**
 * The cross-entropy for soft binary class.
 * \f[
 * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
 * \f]
 */
class SoftBinaryClassCrossEntropy : public CostLayer {
public:
@@ -107,6 +125,13 @@ protected:
MatrixPtr targetPerDim_;
};
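A minimal standalone check of the formula above for one sample, treating every output dimension as an independent binary probability with a soft (real-valued) target; the helper name is hypothetical:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// L = sum_j [ -y_j*log(x_j) - (1-y_j)*log(1-x_j) ], where x_j is a predicted
// probability in (0, 1) and y_j a soft target in [0, 1].
double softBinaryCrossEntropy(const std::vector<double>& x,
                              const std::vector<double>& y) {
  double loss = 0.0;
  for (size_t j = 0; j < x.size(); ++j) {
    loss += -y[j] * std::log(x[j]) - (1.0 - y[j]) * std::log(1.0 - x[j]);
  }
  return loss;
}

int main() {
  std::vector<double> x = {0.9, 0.2, 0.6};  // predicted probabilities
  std::vector<double> y = {1.0, 0.0, 0.5};  // soft labels
  std::printf("loss = %f\n", softBinaryCrossEntropy(x, y));
  return 0;
}
```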
/**
 * This cost layer computes the Euclidean (L2) loss for real-valued regression
* tasks.
* \f[
* L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
* \f]
*/
class SumOfSquaresCostLayer : public CostLayer {
public:
explicit SumOfSquaresCostLayer(const LayerConfig& config)
@@ -119,8 +144,17 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
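A small standalone numeric check of the L2 loss as written above; the 1/(2N) scaling follows the comment's formula and is an assumption about the exact scaling, not a statement about the layer's internals:

```cpp
#include <cstdio>
#include <vector>

// L = 1/(2N) * sum_i || y_hat_i - y_i ||_2^2 over a batch of N samples.
double sumOfSquaresLoss(const std::vector<std::vector<double>>& pred,
                        const std::vector<std::vector<double>>& target) {
  double loss = 0.0;
  for (size_t i = 0; i < pred.size(); ++i) {
    for (size_t d = 0; d < pred[i].size(); ++d) {
      double diff = pred[i][d] - target[i][d];
      loss += diff * diff;
    }
  }
  return loss / (2.0 * pred.size());
}

int main() {
  std::vector<std::vector<double>> pred = {{1.0, 2.0}, {0.5, -1.0}};
  std::vector<std::vector<double>> target = {{1.5, 2.0}, {0.0, -2.0}};
  std::printf("loss = %f\n", sumOfSquaresLoss(pred, target));
  // -> (0.25 + 0 + 0.25 + 1.0) / (2 * 2) = 0.375
  return 0;
}
```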
/**
 * A cost layer for the learning to rank (LTR) task. This layer contains at least
 * three inputs.
 * \f[
 * C_{i,j} = -\tilde{P_{i,j}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
 * o_{i,j} = o_i - o_j \\
 * \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
 * \f]
 *
 * [1] Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
 * Rank using Gradient Descent.
 */
class RankingCost : public Layer {
public:
@@ -155,12 +189,25 @@ private:
double negPairCount_;
MatrixPtr margin_;
MatrixPtr marginGrad_;
/// if input label is put in ids (not value), copy to this buffer.
MatrixPtr labelBuf_;
LayerPtr weightLayer_;
};
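The pairwise cost can be checked in isolation: given two scores o_i and o_j and a target probability in {0, 0.5, 1} (how likely item i should rank above item j), the sketch below evaluates C_{i,j}. It illustrates the math only, not the layer's forward pass, and all names and values are illustrative:

```cpp
#include <cmath>
#include <cstdio>

// C_{i,j} = -P_target * o_{i,j} + log(1 + exp(o_{i,j})), with o_{i,j} = o_i - o_j.
// P_target is 1 if item i should rank above item j, 0 if below, 0.5 if tied.
double rankingCost(double oi, double oj, double pTarget) {
  double o = oi - oj;
  return -pTarget * o + std::log(1.0 + std::exp(o));
}

int main() {
  // Item i scores higher than item j and the label agrees -> small cost.
  std::printf("correct order: %f\n", rankingCost(2.0, 0.5, 1.0));
  // Same scores, but the label says j should rank above i -> larger cost.
  std::printf("wrong order:   %f\n", rankingCost(2.0, 0.5, 0.0));
  return 0;
}
```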
/**
 * LambdaRank is a method for learning arbitrary information retrieval
* measures. It can be applied to any algorithm that learns through gradient
* descent. LambdaRank is a listwise method, in that the cost depends on the
 * sorted order of the documents. LambdaRank gives the gradient of the cost
* function:
*
* \f[
* \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
* \f]
*
* [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
* with Nonsmooth Cost Functions.
*/
class LambdaCost : public Layer {
public:
explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
@@ -191,9 +238,11 @@ private:
};
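To make the lambda expression concrete, the sketch below computes lambda_{ij} for one document pair, taking |Delta_NDCG| as the change in NDCG caused by swapping the two documents in the current ranking (only the two swapped positions change the DCG sum). The relevance labels, scores, and gain/discount choices are assumptions for illustration:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Gain of a document with relevance rel placed at rank pos (1-based).
double dcgTerm(double rel, int pos) {
  return (std::pow(2.0, rel) - 1.0) / std::log2(pos + 1.0);
}

// |Delta NDCG| when the documents at ranks posI and posJ swap places,
// normalized by the DCG of the ideal (descending-relevance) ordering.
double deltaNdcg(const std::vector<double>& rel, int posI, int posJ,
                 double idealDcg) {
  double before = dcgTerm(rel[posI - 1], posI) + dcgTerm(rel[posJ - 1], posJ);
  double after = dcgTerm(rel[posI - 1], posJ) + dcgTerm(rel[posJ - 1], posI);
  return std::fabs(after - before) / idealDcg;
}

int main() {
  // Relevance labels of the documents currently at ranks 1..3.
  std::vector<double> rel = {1.0, 3.0, 0.0};
  // The ideal ordering would place relevance 3 first, then 1, then 0.
  double idealDcg = dcgTerm(3.0, 1) + dcgTerm(1.0, 2) + dcgTerm(0.0, 3);
  double oi = 1.2, oj = 2.0;  // model scores of the documents at ranks 1 and 2
  double lambda =
      1.0 / (1.0 + std::exp(oi - oj)) * deltaNdcg(rel, 1, 2, idealDcg);
  std::printf("lambda_ij = %f\n", lambda);
  return 0;
}
```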
/**
 * Cross entropy for multi binary labels.
 * \f[
 * cost[i] = -sum(label[i][j]*log(output[i][j]) +
 * (1-label[i][j])*log(1-output[i][j]))
 * \f]
 */
class MultiBinaryLabelCrossEntropy : public CostLayer {
protected:
@@ -210,13 +259,18 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
/**
 * Huber loss for robust two-class classification.
 *
 * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
 * \f[
 * Loss =
 * \left\{\begin{matrix}
 * -4 * y * f & \textit{if} \ \ y * f < -1 \\
 * (1 - y * f)^2 & \textit{if} \ \ -1 < y * f < 1 \\
 * 0 & \textit{otherwise}
 * \end{matrix}\right.
 * \f]
 */
class HuberTwoClass : public CostLayer {
std::vector<Argument> tmpCpuInput_;
......
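A standalone check of the piecewise Huber definition above, mapping label in {0, 1} to y = 2*label - 1; the helper name and sample outputs are illustrative only:

```cpp
#include <cstdio>

// Huber two-class loss: y = 2*label - 1, f is the model output.
// -4*y*f when y*f < -1, (1 - y*f)^2 when -1 <= y*f < 1, and 0 otherwise.
double huberTwoClassLoss(double f, int label) {
  double y = 2.0 * label - 1.0;
  double yf = y * f;
  if (yf < -1.0) return -4.0 * yf;
  if (yf < 1.0) return (1.0 - yf) * (1.0 - yf);
  return 0.0;
}

int main() {
  std::printf("confident & correct: %f\n", huberTwoClassLoss(2.0, 1));   // 0
  std::printf("borderline:          %f\n", huberTwoClassLoss(0.5, 1));   // 0.25
  std::printf("confident & wrong:   %f\n", huberTwoClassLoss(-2.0, 1));  // 8
  return 0;
}
```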
@@ -3083,7 +3083,7 @@ following are cost Layers.
@wrap_name_default()
def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
"""
A cost Layer for learning to rank using gradient descent. Details can refer
to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_.
This layer contains at least three inputs. The weight is an optional
argument, which affects the cost.
......