From f063752366c79b6c1fb51d2fafca26c12c8b8ca8 Mon Sep 17 00:00:00 2001
From: dangqingqing
Date: Tue, 30 Aug 2016 02:12:13 +0000
Subject: [PATCH] Adjust doc and code for CostLayer, GradientMachine and DataProvider. Also add some comments for cost layers.

ISSUE=4580653

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1410 1ad973e4-5ce8-4261-8a94-b56d1f490c56
---
 doc/source/gserver/activations/index.rst      |   4 +-
 .../gserver/dataprovider/dataproviders.rst    |  85 +++++++++++++--
 doc/source/gserver/evaluators/evaluators.rst  | 102 ++++++++++++++++++
 doc/source/gserver/evaluators/index.rst       |   9 +-
 .../gradientmachines/gradientmachines.rst     |  44 +++++---
 paddle/gserver/dataproviders/DataProvider.h   |  32 +++---
 .../gserver/dataproviders/ProtoDataProvider.h |   7 +-
 paddle/gserver/evaluators/Evaluator.cpp       |  14 +++
 paddle/gserver/evaluators/Evaluator.h         |  22 ++--
 .../gradientmachines/GradientMachine.h        |   2 +-
 .../gradientmachines/MultiGradientMachine.h   |  98 +++++++++--------
 .../gserver/gradientmachines/NeuralNetwork.h  |  33 +++---
 .../gradientmachines/ParallelNeuralNetwork.h  |  12 ++-
 paddle/gserver/layers/CostLayer.h             |  94 ++++++++++++----
 .../paddle/trainer_config_helpers/layers.py   |   2 +-
 15 files changed, 426 insertions(+), 134 deletions(-)
 create mode 100644 doc/source/gserver/evaluators/evaluators.rst

diff --git a/doc/source/gserver/activations/index.rst b/doc/source/gserver/activations/index.rst
index ed6200d9a..6ceab41cc 100644
--- a/doc/source/gserver/activations/index.rst
+++ b/doc/source/gserver/activations/index.rst
@@ -1,5 +1,5 @@
 Activations
 =============
-.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
-.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
+.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
+.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
diff --git a/doc/source/gserver/dataprovider/dataproviders.rst b/doc/source/gserver/dataprovider/dataproviders.rst
index 2d2ace177..e8aa4bc35 100644
--- a/doc/source/gserver/dataprovider/dataproviders.rst
+++ b/doc/source/gserver/dataprovider/dataproviders.rst
@@ -1,14 +1,83 @@
 Data Providers
 ================
-Data Provider
+Base DataProvider
+------------------
+.. doxygenclass:: paddle::DataProvider
+   :members:
+
+DataProviderGroup
+-------------------
+.. doxygenclass:: paddle::DataProviderGroup
+   :members:
+
+MultiDataProvider
+-------------------
+.. doxygenclass:: paddle::MultiDataProvider
+   :members:
+
+PyDataProvider
+===================
+
+IFieldScanner
+-------------
+.. doxygenclass:: paddle::IFieldScanner
+   :members:
+
+DenseScanner
+-------------
+.. doxygenclass:: paddle::DenseScanner
+   :members:
+
+IndexScanner
+-------------
+.. doxygenclass:: paddle::IndexScanner
+   :members:
+
+SparseNonValueScanner
+---------------------
+.. doxygenclass:: paddle::SparseNonValueScanner
+   :members:
+
+SparseValueScanner
+------------------
+.. doxygenclass:: paddle::SparseValueScanner
+   :members:
+
+SequenceScanner
+------------------
+.. doxygenclass:: paddle::SequenceScanner
+   :members:
+
+IPyDataProviderCache
+--------------------
+.. doxygenclass:: paddle::IPyDataProviderCache
+   :members:
+
+NoCacheStrategy
 ---------------
-.. doxygenfile:: paddle/gserver/dataproviders/DataProvider.h
-.. doxygenfile:: paddle/gserver/dataproviders/PyDataProvider2.cpp
-.. doxygenfile:: paddle/gserver/dataproviders/DataProviderGroup.h
-.. doxygenfile:: paddle/gserver/dataproviders/MultiDataProvider.h
+.. doxygenclass:: paddle::NoCacheStrategy
+   :members:
+
-Proto Data Provider
+CacheOnePassInMemory
 --------------------
-.. doxygenfile:: paddle/gserver/dataproviders/ProtoDataProvider.h
-.. doxygenfile:: paddle/gserver/dataproviders/ProtoReader.h
+.. doxygenclass:: paddle::CacheOnePassInMemory
+   :members:
+
+PyDataProvider2
+---------------
+.. doxygenclass:: paddle::PyDataProvider2
+   :members:
+
+Proto Data Provider
+===================
+
+ProtoDataProvider
+-----------------
+.. doxygenclass:: paddle::ProtoDataProvider
+   :members:
+
+ProtoSequenceDataProvider
+-------------------------
+.. doxygenclass:: paddle::ProtoSequenceDataProvider
+   :members:
diff --git a/doc/source/gserver/evaluators/evaluators.rst b/doc/source/gserver/evaluators/evaluators.rst
new file mode 100644
index 000000000..0c5cc85e7
--- /dev/null
+++ b/doc/source/gserver/evaluators/evaluators.rst
@@ -0,0 +1,102 @@
+Base Evaluator
+==============
+
+Evaluator
+---------
+.. doxygenclass:: paddle::Evaluator
+   :members:
+
+
+Utils
+=====
+
+SumEvaluator
+------------
+.. doxygenclass:: paddle::SumEvaluator
+   :members:
+
+ColumnSumEvaluator
+------------------
+.. doxygenclass:: paddle::ColumnSumEvaluator
+   :members:
+
+Classification
+==============
+
+ClassificationErrorEvaluator
+----------------------------
+.. doxygenclass:: paddle::ClassificationErrorEvaluator
+   :members:
+
+SequenceClassificationErrorEvaluator
+------------------------------------
+.. doxygenclass:: paddle::SequenceClassificationErrorEvaluator
+   :members:
+
+AucEvaluator
+-------------
+.. doxygenclass:: paddle::AucEvaluator
+   :members:
+
+PrecisionRecallEvaluator
+------------------------
+.. doxygenclass:: paddle::PrecisionRecallEvaluator
+   :members:
+
+ChunkEvaluator
+--------------
+.. doxygenclass:: paddle::ChunkEvaluator
+   :members:
+
+CTCErrorEvaluator
+-----------------
+.. doxygenclass:: paddle::CTCErrorEvaluator
+   :members:
+
+
+Rank
+====
+
+PnpairEvaluator
+---------------
+.. doxygenclass:: paddle::PnpairEvaluator
+   :members:
+
+RankAucEvaluator
+----------------
+.. doxygenclass:: paddle::RankAucEvaluator
+   :members:
+
+
+Printer
+=======
+
+ValuePrinter
+-------------
+.. doxygenclass:: paddle::ValuePrinter
+   :members:
+
+GradientPrinter
+---------------
+.. doxygenclass:: paddle::GradientPrinter
+   :members:
+
+MaxIdPrinter
+------------
+.. doxygenclass:: paddle::MaxIdPrinter
+   :members:
+
+MaxFramePrinter
+---------------
+.. doxygenclass:: paddle::MaxFramePrinter
+   :members:
+
+SequenceTextPrinter
+-------------------
+.. doxygenclass:: paddle::SequenceTextPrinter
+   :members:
+
+ClassificationErrorPrinter
+--------------------------
+.. doxygenclass:: paddle::ClassificationErrorPrinter
+   :members:
diff --git a/doc/source/gserver/evaluators/index.rst b/doc/source/gserver/evaluators/index.rst
index d7f622ff8..298de3e1a 100644
--- a/doc/source/gserver/evaluators/index.rst
+++ b/doc/source/gserver/evaluators/index.rst
@@ -1,8 +1,7 @@
 Evaluators
-============
-
-.. doxygenfile:: paddle/gserver/evaluators/Evaluator.h
-.. doxygenfile:: paddle/gserver/evaluators/ChunkEvaluator.cpp
-.. doxygenfile:: paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+==========
+.. toctree::
+   :maxdepth: 3
+
+   evaluators.rst
diff --git a/doc/source/gserver/gradientmachines/gradientmachines.rst b/doc/source/gserver/gradientmachines/gradientmachines.rst
index b3009f274..3607664c8 100644
--- a/doc/source/gserver/gradientmachines/gradientmachines.rst
+++ b/doc/source/gserver/gradientmachines/gradientmachines.rst
@@ -1,20 +1,40 @@
-Gradient machines
-===================
+Gradient Machines
+=================
 
-Networks
-------------
-.. doxygenfile:: paddle/gserver/gradientmachines/MultiNetwork.h
-.. doxygenfile:: paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+GradientMachine
+---------------------
+.. doxygenclass:: paddle::GradientMachine
+   :members:
 
-Gradient Machines
+IGradientMachineMode
 --------------------
-.. doxygenfile:: paddle/gserver/gradientmachines/GradientMachine.h
-.. doxygenfile:: paddle/gserver/gradientmachines/MultiGradientMachine.h
+.. doxygenclass:: paddle::IGradientMachineMode
+   :members:
+
+MultiGradientMachine
+---------------------
+.. doxygenclass:: paddle::MultiGradientMachine
+   :members:
+
+TrainerThread
+`````````````
+.. doxygenclass:: paddle::TrainerThread
+   :members:
 
 Recurrent Gradient Machines
------------------------------
-.. doxygenfile:: paddle/gserver/gradientmachines/RecurrentGradientMachine.h
-.. doxygenfile:: paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+---------------------------
+.. doxygenclass:: paddle::RecurrentGradientMachine
+   :members:
+
+Networks
+========
+
+NeuralNetwork
+-------------
+.. doxygenclass:: paddle::NeuralNetwork
+   :members:
+
+ParallelNeuralNetwork
+---------------------
+.. doxygenclass:: paddle::ParallelNeuralNetwork
+   :members:
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 3c4bea0b3..fb404405f 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -118,10 +118,10 @@ public:
     data_.push_back(argu);
   }
 
-  /*
-   * argus: DataBatch.getStreams()
-   * size: DataBatch.getSize()
-   * dataId: sub dataprovider id (in MultiDataProvider)
+  /**
+   * @param argus DataBatch.getStreams()
+   * @param size DataBatch.getSize()
+   * @param dataId sub dataprovider id (in MultiDataProvider)
    */
   void appendArguments(const std::vector<Argument>& argus, int size,
                        int dataId) {
@@ -312,22 +312,28 @@ public:
   }
 };
 
-// Data provider for one input and one integer label
+/**
+ * Data provider for one input and one integer label.
+ */
 class SimpleDataProviderBase : public DataProvider {
 protected:
-  int64_t sampleDim_;       // sample feature dimension
-  int64_t bufferCapacity_;  // the number of samples
+  /// sample feature dimension
+  int64_t sampleDim_;
+  /// the number of samples
+  int64_t bufferCapacity_;
   int64_t sampleNumInBuf_;
-  int64_t nextItemIndex_;  // next item to read in buffer
-  bool withInfo_;          // some user defined info for validation
+  /// next item to read in buffer
+  int64_t nextItemIndex_;
+  /// some user-defined info for validation
+  bool withInfo_;
 
-  // data buffer: bufferCapacity_ * nDataDim_
+  /// data buffer: bufferCapacity_ * nDataDim_
   CpuMatrixPtr hInputDataBuf_;
 
-  // label buffer:bufferCapacity_ * 1
+  /// label buffer: bufferCapacity_ * 1
   CpuIVectorPtr hInputLabelBuf_;
 
-  // info buffer:bufferCapacity_ * 1
+  /// info buffer: bufferCapacity_ * 1
   CpuIVectorPtr hInputInfoBuf_;
 
   ThreadLocal dataBatch_;
@@ -348,7 +354,7 @@ public:
 
   virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
 
-  // return the number of samples in the buffer
+  /// return the number of samples in the buffer
   int64_t fillBuffer();
 
 protected:
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 0f3f55738..02db5f870 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -80,7 +80,7 @@ protected:
    */
   inline bool iidData() const { return sequenceStartPositions_.empty(); }
 
-  // check that sample is consistent with header_
+  /// check that sample is consistent with header_
   void checkSample(const DataSample& sample);
 
   template
@@ -129,14 +129,15 @@ protected:
 
   int64_t currentSequenceIndex_;
 
-  // The size should be the number of sequences.
+  /// The size should be the number of sequences.
   std::vector shuffledSequenceIds_;
 
   ThreadLocalD cpuBatch_;
   ThreadLocalD gpuBatch_;
 
   RWLock lock_;
-  std::vector nnzStats_;  // stats for number of none-zeros entries
+  /// stats for the number of non-zero entries
+  std::vector nnzStats_;
 };
 
 /**
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index ba9847f6a..a50eecdbb 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -1000,20 +1000,34 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
 /**
  * Sequence text printer will print text according to index matrix and a
  * dictionary. There can be multiple inputs to this layer:
+ *
  * 1) If there is only one input, the input must be a matrix containing
  *    the sequence of indices;
+ *
  * 2) If there is more than one input, the first input should be ids,
  *    which are interpreted as sample ids.
 *
 * The output format will be:
+ *
 * 1) sequence without sub-sequence, and there is probability.
+ *
+ * @code
 *    id \t prob space_separated_tokens_from_dictionary_according_to_seq
+ * @endcode
+ *
 * 2) sequence without sub-sequence, and there is no probability.
+ *
+ * @code
 *    id \t space_separated_tokens_from_dictionary_according_to_seq
+ * @endcode
+ *
 * 3) sequence with sub-sequence, and there is no probability.
+ *
+ * @code
 *    id \t space_separated_tokens_from_dictionary_according_to_sub_seq
 *    \t \t space_separated_tokens_from_dictionary_according_to_sub_seq
 *    ...
+ * @endcode
 *
 * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
 * with maxid (when generating) as an input.
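The three SequenceTextPrinter output formats above are easiest to see in running code. The following standalone Python sketch is illustrative only (it is not part of this patch, and format_sequence, dictionary, seq, prob, and sub_seqs are hypothetical names):

    def format_sequence(sample_id, dictionary, seq=None, prob=None,
                        sub_seqs=None):
        """Render one sample in the three formats documented above."""
        if sub_seqs is not None:
            # format 3: first sub-sequence on the id line, rest indented
            lines = ["%d\t%s" % (sample_id,
                                 " ".join(dictionary[i] for i in sub_seqs[0]))]
            lines += ["\t\t" + " ".join(dictionary[i] for i in s)
                      for s in sub_seqs[1:]]
            return "\n".join(lines)
        tokens = " ".join(dictionary[i] for i in seq)
        if prob is not None:
            # format 1: id, probability, then the tokens
            return "%d\t%g %s" % (sample_id, prob, tokens)
        # format 2: id and tokens only
        return "%d\t%s" % (sample_id, tokens)

    # e.g. format_sequence(7, ["a", "b", "c"], seq=[2, 0], prob=0.5)
    # returns "7\t0.5 c a"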
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 316219b4f..eee785e0e 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -68,7 +68,7 @@ public:
     numSamples_ += arguments[0].getBatchSize();
   }
 
-  // finish() should be called before distributeEval
+  /// finish() should be called before distributeEval
   virtual void distributeEval(ParameterClient2* client) {
     LOG(FATAL) << "Not implemented";
   }
@@ -85,7 +85,7 @@ public:
    */
   virtual void finish() {}
 
-  // finish() should be called before printStats
+  /// finish() should be called before printStats
   virtual void printStats(std::ostream& os) {
     os << config_.name() << "="
        << (numSamples_ ? totalScore_ / numSamples_ : 0);
@@ -130,9 +130,9 @@ public:
   /**
    * @brief evaluate AUC using colIdx-th column as prediction.
    *
-   * colIdx = 0: the 0-th column.
-   * colIdx > 0: the colIdx-th column.
-   * colIdx < 0: the last colIdx-th column.
+   * - colIdx = 0: the 0-th column.
+   * - colIdx > 0: the colIdx-th column.
+   * - colIdx < 0: the colIdx-th column, counting from the end.
    *
    */
   AucEvaluator(int32_t colIdx)
@@ -223,10 +223,14 @@ public:
   virtual void distributeEval(ParameterClient2* client);
 
   struct StatsInfo {
-    double TP;  // numbers of true positives
-    double TN;  // numbers of true negatives
-    double FP;  // numbers of false positives
-    double FN;  // numbers of false negatives
+    /// number of true positives
+    double TP;
+    /// number of true negatives
+    double TN;
+    /// number of false positives
+    double FP;
+    /// number of false negatives
+    double FN;
 
     StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
   };
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 7233f985c..986a1ee71 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -247,7 +247,7 @@ public:
 
   virtual void restart() {}
 
-  // Set the gradient of the output from outside.
+  /// Set the gradient of the output from outside.
   virtual void setOutputGrad(const std::vector<Argument>& args) {
     LOG(FATAL) << "Not implemented!";
   }
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 7c4ec4f6d..d13cf426c 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -31,14 +31,15 @@ typedef Queue<int> PidQueue;
 typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
 
 struct GradBuffer {
-  // GradBuffer is used for gathering gradient for GPU parameters
+  /// GradBuffer is used for gathering gradients for GPU parameters
   int paramId;
 
-  // sem is used to notify that the local gradient merge of the current thread
-  // finished for the current thread.
+  /// sem is used to notify that the local gradient merge for the current
+  /// thread has finished.
   Semaphore sem;
 
-  std::vector<MatrixPtr> bufs;  // bufs[mergeIndex]
+  /// bufs[mergeIndex]
+  std::vector<MatrixPtr> bufs;
 };
 
 /**
@@ -189,14 +190,14 @@ public:
     return useGpu_;
   }
 
-  // @return whether to pass the gradients in outArgs_ to each threads.
+  /// @return whether to pass the gradients in outArgs_ to each thread.
   bool isPassGrad() { return isPassGrad_; }
 
-  // @brief set whether to pass the gradient in outArgs_ to each threads.
+  /// @brief set whether to pass the gradient in outArgs_ to each thread.
   void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
 
-  // Set the gradients of the outputs.
-  // The gradietns will be copied to each thread in the computing threads.
+  /// Set the gradients of the outputs.
+  /// The gradients will be copied to each thread in the computing threads.
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
 protected:
@@ -205,8 +206,8 @@ protected:
 
   std::vector<TrainerThreadPtr>& getAllThreads() { return threads_; }
 
-  // Calculate the real device id based on the logical device id and the
-  // thread id.
+  /// Calculate the real device id based on the logical device id and the
+  /// thread id.
   int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
     if (logicalId == -1) {
       logicalId = 0;
     }
@@ -215,8 +216,8 @@
                numDevices_);
   }
 
-  // Calculate the logical device id based on the real device id and the
-  // thread id.
+  /// Calculate the logical device id based on the real device id and the
+  /// thread id.
   int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
     if (realId == -1) {
       return 0;
     }
@@ -232,15 +233,15 @@
     return hasNonstaticCpuParamters_;
   }
 
-  // Called TrainerThread to wait before merging CPU parameter gradients.
+  /// Called by TrainerThread to wait before merging CPU parameter gradients.
   void waitBeforeMerge() { trainerBarrier_.wait(); }
 
-  // called by MultiGradientMachine and TrainerThread to wait after merging
-  // CPU parameter graidents.
+  /// Called by MultiGradientMachine and TrainerThread to wait after merging
+  /// CPU parameter gradients.
   void waitAfterMerge() { allBarrier_.wait(); }
 
-  // called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
-  // finishing
+  /// Called by MultiGradientMachine and TrainerThread to wait for
+  /// copyInArgs() to finish.
   void waitForCopyInArgs() { allBarrier_.wait(); }
 
   TrainerThreadPtr& getThread(int threadId) {
     return threads_[threadId];
   }
@@ -255,8 +256,8 @@
     return passType_;
   }
 
-  // Called by TrainerThread to notify MultiGradientMachine that the gradient
-  // for paramId is ready
+  /// Called by TrainerThread to notify MultiGradientMachine that the gradient
+  /// for paramId is ready.
   void notifyGradientTransfer(int paramId);
 
   const std::vector<Argument>& getInArgs() {
@@ -297,7 +298,7 @@
   virtual void backwardImp(
       const UpdateCallback& callback = NULL);
 
-  // update all parameters
+  /// update all parameters
   void updateThreadParameters();
 
   void startTask(TaskType taskType);
@@ -311,7 +312,7 @@
 
   bool hasNonstaticCpuParamters_;
 
-  // store main parameter only
+  /// store main parameter only
   std::unique_ptr<GradientMachine> gradientMachine_;
 
   std::vector<TrainerThreadPtr> threads_;
@@ -326,7 +327,7 @@
   std::vector<Argument> outArgs_;
   hl_stream_t outArgStream_;
 
-  // ParameterType which needs to be merged from each GPU
+  /// ParameterType which needs to be merged from each GPU
   std::vector<ParameterType> mergeTypes_;
   int numDevices_;         /* number of gpu devices */
   int numLogicalDevices_;  // number of GPU used by one NN
@@ -334,16 +335,16 @@
 
   UpdateCallback backwardCallback_;
 
-  // barrrier for threads_
+  /// barrier for threads_
   ThreadBarrier trainerBarrier_;
 
-  // barrier for both MultiGradientMachine and threds_
+  /// barrier for both MultiGradientMachine and threads_
   ThreadBarrier allBarrier_;
 
-  // indicate whether inArgs is copied before forward()
+  /// indicate whether inArgs is copied before forward()
   bool inArgsCopied_;
 
-  // Whether to copy the gradient back from an external input.
+  /// Whether to copy the gradient back from an external input.
   bool isPassGrad_;
 };
@@ -413,7 +414,7 @@ public:
 
   void prefetch();
 
-  // copy the output gradient from the main GradientMachine.
+  /// copy the output gradient from the main GradientMachine.
   void copyOutputGrad();
 
 protected:
@@ -441,51 +442,60 @@ protected:
   void backward();
   void backwardCallback(Parameter* para);
 
-  // call the actuall callback supplied by the caller of
-  // GradientMachine::backward
+  /// call the actual callback supplied by the caller of
+  /// GradientMachine::backward
   void doCallback(int pid);
 
 protected:
   MultiGradientMachine* multiMachine_;
   ModelConfig config_;
 
-  bool stopping_;  // whether the thread should stop
-  int partnerId_;  // the threads form which to collect gradient
-  int threadId_;   // from 0 to #threads-1
+  /// whether the thread should stop
+  bool stopping_;
+  /// the thread from which to collect gradients
+  int partnerId_;
+  /// from 0 to (number of threads - 1)
+  int threadId_;
   int deviceId_;
   std::unique_ptr<GradientMachine> gradientMachine_;
   std::vector<ParameterPtr> parameters_;
 
-  // ParameterType which needs to be merged from each GPU
+  /// ParameterType which needs to be merged from each GPU
   std::vector<ParameterType> mergeTypes_;
 
-  std::unique_ptr<std::thread> computeThread_;  // compute thread
+  /// compute thread
+  std::unique_ptr<std::thread> computeThread_;
   std::vector<Argument> inArgs_;
   std::vector<Argument> outArgs_;
   Semaphore taskReadySem_;
   Semaphore outArgsReadySem_;
 
-  std::unique_ptr<std::thread> copyThread_;  // copy thread
-  PidQueue gradBufQueue_;  // queue of gradient needs to be copied to partner
+  /// copy thread
+  std::unique_ptr<std::thread> copyThread_;
+  /// queue of gradients that need to be copied to the partner
+  PidQueue gradBufQueue_;
   hl_stream_t gradStream_;
 
-  std::unique_ptr<std::thread> gradCollectThread_;  // grad merge thread
-  // queue of gradient needs to be merged with gradient coopied by
-  // copyGradToBufferThread
+  /// grad merge thread
+  std::unique_ptr<std::thread> gradCollectThread_;
+  /// queue of gradients that need to be merged with the gradients copied by
+  /// copyGradToBufferThread
   PidQueue gradQueue_;
   UpdateCallback backwardCallback_;
 
-  std::unique_ptr<std::thread> valueDispatchThread_;  // value dispatch thread
-  // queue of the parameter whose the vale are ready for copy
+  /// value dispatch thread
+  std::unique_ptr<std::thread> valueDispatchThread_;
+  /// queue of parameters whose values are ready for copy
+  PidQueue valueReadyQueue_;
 
-  // used to notify all the parameter values are ready
+  /// used to notify that all the parameter values are ready
   LockedCondition valueReadyCond_;
 
   hl_stream_t valueStream_;
-  std::atomic<int> updateCounter_;  // how many parameters are updated
+  /// how many parameters are updated
+  std::atomic<int> updateCounter_;
   bool parameterUpdated_;
 
-  // indicate whether inArgs is copied before forward()
+  /// indicate whether inArgs is copied before forward()
   bool inArgsCopied_;
 };
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 1b440042d..06c679a63 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -66,12 +66,15 @@ public:
                      PARAMETER_MOMENTUM},
       bool useGpu = FLAGS_use_gpu);
 
-  // connect two submodels
-  // down-submodel's output become up-submodel's input
-  // *realLayer* is down-submodel's output layer
-  // *agentLayer* is up-submodel's input agent layer
-  // by default, connection is one by one,
-  // if the agent height is smaller than real layer, *height* has to be filled
+  /**
+   * Connect two submodels: the down-submodel's output becomes
+   * the up-submodel's input.
+   * By default, the connection is one to one; if the agent height is smaller
+   * than the real layer's, *height* has to be filled.
+   *
+   * @param realLayer The down-submodel's output layer.
+   * @param agentLayer The up-submodel's input agent layer.
+   */
   static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
   void connect(std::string agentLayerName, NeuralNetwork* srcNN,
                std::string realLayerName);
@@ -98,10 +101,10 @@ public:
   virtual void resetState();
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
-  // set machine state
+  /// set machine state
   virtual void setState(const MachineState& machineState);
 
-  // get machine state
+  /// get machine state
   virtual void getState(MachineState& machineState);
 
   static NeuralNetwork* create(const ModelConfig& config);
@@ -126,8 +129,14 @@ public:
                      NeuralNetwork* rootNetwork = nullptr);
 
 protected:
-  // rootNetwork: used in MultiNetwork
-  // sub networks can get parameters_ and parameterMap_ from base NeuralNetwork
+  /**
+   * The constructor of NeuralNetwork.
+   * The sub networks can get parameters_ and parameterMap_
+   * from the base NeuralNetwork.
+   *
+   * @param subModelName The name of the sub-model.
+   * @param rootNetwork It is used in MultiNetwork.
+   */
   NeuralNetwork(std::string subModelName = "",
                 NeuralNetwork* rootNetwork = nullptr)
       : subModelName_(subModelName),
@@ -146,8 +155,8 @@ protected:
 
   NeuralNetwork* rootNetwork_;
 
-  // Whether parameter of this NN is initialized by its own
-  // (i.e., not by callback supplied with the caller)
+  /// Whether the parameters of this NN are initialized on its own
+  /// (i.e., not by the callback supplied by the caller)
   bool paramSelfInited_;
 };
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index eaf2376e1..2a3db654f 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -67,7 +67,8 @@ public:
 
 protected:
   bool useGpu_;
-  int numDevices_;  /* number of gpu devices */
+  /// number of gpu devices
+  int numDevices_;
   std::vector<std::unique_ptr<ParallelThread>> threads_;
 };
@@ -97,11 +98,14 @@ public:
   JobQueue queue_;
 
 protected:
-  int threadId_;  // from 0 to #threads-1
-  int deviceId_;  // the GPU device Id which the computeThread_ used
+  /// from 0 to (number of threads - 1)
+  int threadId_;
+  /// the GPU device id used by computeThread_
+  int deviceId_;
   bool useGpu_;
   std::unique_ptr<std::thread> computeThread_;
-  bool stopping_;  // whether the thread should stop
+  /// whether the thread should stop
+  bool stopping_;
   UpdateCallback backwardCallback_;
   PassType passType_;
 };
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index b4383370a..b464e1673 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -53,8 +53,13 @@ protected:
   real coeff_;
 };
 
-/*
- * MultiClassCrossEntropy
+/**
+ * The cross-entropy loss for a multi-class classification task.
+ * The loss function is:
+ *
+ * \f[
+ * L = - \sum_{k}{t_{k} * log(P(y=k))}
+ * \f]
  */
 class MultiClassCrossEntropy : public CostLayer {
 public:
@@ -68,9 +73,20 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
 
-/*
- * MultiClassCrossEntropyWithSelfNorm
- * \sum_i (-log(x_label(i)) + alpha * log(Z(i)^2)
+/**
+ * The cross-entropy with self-normalization for multi-class classification.
+ *
+ * The loss function is:
+ * \f[
+ * L = \sum_{i}[-log(P(x_{i})) + \alpha * log(Z(x_{i})^2)]
+ * \f]
+ *
+ * The \f$Z(x)\f$ is the softmax normalizer.
+ *
+ * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
+ *     Richard Schwartz, and John Makhoul. Fast and robust neural
+ *     network joint models for statistical machine translation.
+ *     In Proceedings of the ACL 2014 Conference.
  */
 class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
 public:
@@ -88,9 +104,11 @@ protected:
   MatrixPtr sumInv_;
 };
 
-/*
- * SoftBinaryClassCrossEntropy
- * \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
+/**
+ * The cross-entropy for soft binary class.
+ * \f[
+ * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
+ * \f]
  */
 class SoftBinaryClassCrossEntropy : public CostLayer {
 public:
@@ -107,6 +125,13 @@ protected:
   MatrixPtr targetPerDim_;
 };
 
+/**
+ * This cost layer computes the Euclidean (L2) loss for real-valued regression
+ * tasks.
+ * \f[
+ * L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
+ * \f]
+ */
 class SumOfSquaresCostLayer : public CostLayer {
 public:
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
@@ -119,8 +144,17 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
 
-/*
- * RankingCost
+/**
+ * A cost layer for a learning-to-rank (LTR) task. This layer contains at
+ * least three inputs.
+ * \f[
+ * C_{i,j} = -\tilde{P_{i,j}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
+ * o_{i,j} = o_i - o_j \\
+ * \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
+ * \f]
+ *
+ * [1] Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
+ *     Rank using Gradient Descent.
  */
 class RankingCost : public Layer {
 public:
@@ -155,12 +189,25 @@ private:
   double negPairCount_;
   MatrixPtr margin_;
   MatrixPtr marginGrad_;
-  // if input label is put in ids (not value), copy to this buffer.
+  /// if input label is put in ids (not value), copy to this buffer.
   MatrixPtr labelBuf_;
   LayerPtr weightLayer_;
 };
 
-/* lambdaRank listwise LTR approach */
+/**
+ * LambdaRank is a method for learning arbitrary information retrieval
+ * measures. It can be applied to any algorithm that learns through gradient
+ * descent. LambdaRank is a listwise method, in that the cost depends on the
+ * sorted order of the documents. LambdaRank gives the gradient of the cost
+ * function:
+ *
+ * \f[
+ * \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
+ * \f]
+ *
+ * [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
+ *     with Nonsmooth Cost Functions.
+ */
 class LambdaCost : public Layer {
 public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
@@ -191,9 +238,11 @@ private:
 };
 
 /**
- * Cross entropy for multi binary labels
- * cost[i] = -sum(label[i][j]*log(output[i][j])
- *           + (1-label[i][j])*log(1-output[i][j]))
+ * Cross entropy for multi binary labels.
+ * \f[
+ * cost[i] = -sum(label[i][j]*log(output[i][j]) +
+ *           (1-label[i][j])*log(1-output[i][j]))
+ * \f]
 */
 class MultiBinaryLabelCrossEntropy : public CostLayer {
 protected:
@@ -210,13 +259,18 @@ public:
   void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
 
-/*
- * Huber loss for robust 2-classes classification
+/**
+ * Huber loss for robust two-class classification.
 *
 * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
- *   -4*y*f, if y*f < -1
- *   (1-y*f)^2, if -1 < y*f < 1,
- *   0, otherwise
+ * \f[
+ * Loss =
+ * \left\{\begin{matrix}
+ *  -4 * y * f & \textit{if} \ \ y * f < -1 \\
+ *  (1 - y * f)^2 & \textit{if} \ \ -1 < y * f < 1 \\
+ *  0 & \textit{otherwise}
+ * \end{matrix}\right.
+ * \f]
  */
 class HuberTwoClass : public CostLayer {
   std::vector<Argument> tmpCpuInput_;
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 931a373fc..85625c2f6 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3083,7 +3083,7 @@ following are cost Layers.
 @wrap_name_default()
 def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
     """
-    A cost Layer for leanrning to rank using gradient descent. Details can refer
+    A cost Layer for learning to rank using gradient descent. Details can refer
     to `papers `_.
     This layer contains at least three inputs. The weight is an optional
     argument, which affects the cost.
-- 
GitLab
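As a numeric sanity check on the HuberTwoClass piecewise loss documented in CostLayer.h above, here is a minimal standalone Python sketch. It is illustrative only; huber_two_class is a hypothetical helper and is not part of this patch or of the Paddle API:

    def huber_two_class(f, label):
        """Piecewise Huber loss for two-class classification; label is 0 or 1."""
        y = 2.0 * label - 1.0       # map {0, 1} to {-1, +1}
        yf = y * f
        if yf < -1.0:
            return -4.0 * yf        # linear branch for confidently wrong outputs
        if yf < 1.0:
            return (1.0 - yf) ** 2  # quadratic branch inside the margin
        return 0.0                  # correct with margin: zero loss

    # e.g. huber_two_class(-2.0, 1) == 8.0 and huber_two_class(2.0, 1) == 0.0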