Commit f0637523 authored by D dangqingqing

Adjustment doc and code for CostLayer, GradientMachine and DataProvider.

Also add some comments for cost layers.
ISSUE=4580653

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1410 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent 4268885c
Activations
===========
.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
Data Providers
==============
Base DataProvider
------------------
.. doxygenclass:: paddle::DataProvider
:members:
DataProviderGroup
-------------------
.. doxygenclass:: paddle::DataProviderGroup
:members:
MultiDataProvider
-------------------
.. doxygenclass:: paddle::MultiDataProvider
:members:
PyDataProvider
===================
IFieldScanner
-------------
.. doxygenclass:: paddle::IFieldScanner
:members:
DenseScanner
-------------
.. doxygenclass:: paddle::DenseScanner
:members:
IndexScanner
-------------
.. doxygenclass:: paddle::IndexScanner
:members:
SparseNonValueScanner
---------------------
.. doxygenclass:: paddle::SparseNonValueScanner
:members:
SparseValueScanner
------------------
.. doxygenclass:: paddle::SparseValueScanner
:members:
SequenceScanner
------------------
.. doxygenclass:: paddle::SequenceScanner
:members:
IPyDataProviderCache
--------------------
.. doxygenclass:: paddle::IPyDataProviderCache
:members:
NoCacheStrategy
---------------
.. doxygenclass:: paddle::NoCacheStrategy
:members:
CacheOnePassInMemory
--------------------
.. doxygenclass:: paddle::CacheOnePassInMemory
:members:
PyDataProvider2
---------------
.. doxygenclass:: paddle::PyDataProvider2
:members:
Proto Data Provider
===================
ProtoDataProvider
-----------------
.. doxygenclass:: paddle::ProtoDataProvider
:members:
ProtoSequenceDataProvider
-------------------------
.. doxygenclass:: paddle::ProtoSequenceDataProvider
:members:
Base Evaluator
==============
Evaluator
---------
.. doxygenclass:: paddle::Evaluator
:members:
Utils
=====
SumEvaluator
------------
.. doxygenclass:: paddle::SumEvaluator
:members:
ColumnSumEvaluator
------------------
.. doxygenclass:: paddle::ColumnSumEvaluator
:members:
Classification
==============
ClassificationErrorEvaluator
----------------------------
.. doxygenclass:: paddle::ClassificationErrorEvaluator
:members:
SequenceClassificationErrorEvaluator
------------------------------------
.. doxygenclass:: paddle::SequenceClassificationErrorEvaluator
:members:
AucEvaluator
-------------
.. doxygenclass:: paddle::AucEvaluator
:members:
PrecisionRecallEvaluator
------------------------
.. doxygenclass:: paddle::PrecisionRecallEvaluator
:members:
ChunkEvaluator
--------------
.. doxygenclass:: paddle::ChunkEvaluator
:members:
CTCEvaluator
------------
.. doxygenclass:: paddle::CTCErrorEvaluator
:members:
Rank
====
PnpairEvaluator
---------------
.. doxygenclass:: paddle::PnpairEvaluator
:members:
RankAucEvaluator
----------------
.. doxygenclass:: paddle::RankAucEvaluator
:members:
Printer
=======
ValuePrinter
-------------
.. doxygenclass:: paddle::ValuePrinter
:members:
GradientPrinter
---------------
.. doxygenclass:: paddle::GradientPrinter
:members:
MaxIdPrinter
------------
.. doxygenclass:: paddle::MaxIdPrinter
:members:
MaxFramePrinter
---------------
.. doxygenclass:: paddle::MaxFramePrinter
:members:
SequenceTextPrinter
-------------------
.. doxygenclass:: paddle::SequenceTextPrinter
:members:
ClassificationErrorPrinter
--------------------------
.. doxygenclass:: paddle::ClassificationErrorPrinter
:members:
Evaluators
==========
.. toctree::
:maxdepth: 3
evaluators.rst
Gradient Machines
=================
GradientMachine
---------------
.. doxygenclass:: paddle::GradientMachine
:members:
GradientMachineModel
--------------------
.. doxygenclass:: paddle::IGradientMachineMode
:members:
MultiGradientMachine
---------------------
.. doxygenclass:: paddle::MultiGradientMachine
:members:
TrainerThread
`````````````
.. doxygenclass:: paddle::TrainerThread
:members:
Recurrent Gradient Machines
---------------------------
.. doxygenclass:: paddle::RecurrentGradientMachine
:members:
Networks
========
NeuralNetwork
-------------
.. doxygenclass:: paddle::NeuralNetwork
:members:
ParallelNeuralNetwork
---------------------
.. doxygenclass:: paddle::ParallelNeuralNetwork
:members:
@@ -118,10 +118,10 @@ public:
data_.push_back(argu);
}
/**
 * @param argus: DataBatch.getStreams()
 * @param size: DataBatch.getSize()
 * @param dataId: sub dataprovider id (in MultiDataProvider)
 */
void appendArguments(const std::vector<Argument>& argus, int size,
int dataId) {
@@ -312,22 +312,28 @@ public:
}
};
/**
 * Data provider for one input and one integer label.
 */
class SimpleDataProviderBase : public DataProvider {
protected:
/// sample feature dimension
int64_t sampleDim_;
/// the number of samples
int64_t bufferCapacity_;
int64_t sampleNumInBuf_;
/// next item to read in buffer
int64_t nextItemIndex_;
/// some user defined info for validation
bool withInfo_;
/// data buffer: bufferCapacity_ * nDataDim_
CpuMatrixPtr hInputDataBuf_;
/// label buffer: bufferCapacity_ * 1
CpuIVectorPtr hInputLabelBuf_;
/// info buffer: bufferCapacity_ * 1
CpuIVectorPtr hInputInfoBuf_;
ThreadLocal<MatrixPtr> dataBatch_;
@@ -348,7 +354,7 @@ public:
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
/// return the number of samples in the buffer
int64_t fillBuffer();
protected:
......
@@ -80,7 +80,7 @@ protected:
 */
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/// check that sample is consistent with header_
void checkSample(const DataSample& sample);
template <class Op>
@@ -129,14 +129,15 @@ protected:
int64_t currentSequenceIndex_;
/// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
// stats for number of non-zero entries
std::vector<StatPtr> nnzStats_;
};
/**
......
@@ -1000,20 +1000,34 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
/**
 * Sequence text printer will print text according to index matrix and a
 * dictionary. There can be multiple inputs to this layer:
 *
 * 1) If there is only one input, the input must be a matrix containing
 * the sequence of indices;
 *
 * 2) If there is more than one input, the first input should be ids,
 * which are interpreted as sample ids.
 *
 * The output format will be:
 *
 * 1) sequence without sub-sequence, and there is probability.
 *
 * @code
 * id \t prob space_separated_tokens_from_dictionary_according_to_seq
 * @endcode
 *
 * 2) sequence without sub-sequence, and there is no probability.
 *
 * @code
 * id \t space_separated_tokens_from_dictionary_according_to_seq
 * @endcode
 *
 * 3) sequence with sub-sequence, and there is no probability.
 *
 * @code
 * id \t space_separated_tokens_from_dictionary_according_to_sub_seq
 * \t \t space_separated_tokens_from_dictionary_according_to_sub_seq
 * ...
 * @endcode
 *
 * Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
 * with maxid (when generating) as an input.
......
@@ -68,7 +68,7 @@ public:
numSamples_ += arguments[0].getBatchSize();
}
/// finish() should be called before distributeEval
virtual void distributeEval(ParameterClient2* client) {
LOG(FATAL) << "Not implemented";
}
@@ -85,7 +85,7 @@ public:
 */
virtual void finish() {}
/// finish() should be called before printStats
virtual void printStats(std::ostream& os) {
os << config_.name() << "="
<< (numSamples_ ? totalScore_ / numSamples_ : 0);
@@ -130,9 +130,9 @@ public:
/**
 * @brief evaluate AUC using colIdx-th column as prediction.
 *
 * - colIdx = 0: the 0-th column.
 * - colIdx > 0: the colIdx-th column.
 * - colIdx < 0: the |colIdx|-th column from the end.
 *
 */
AucEvaluator(int32_t colIdx)
@@ -223,10 +223,14 @@ public:
virtual void distributeEval(ParameterClient2* client);
struct StatsInfo {
/// number of true positives
double TP;
/// number of true negatives
double TN;
/// number of false positives
double FP;
/// number of false negatives
double FN;
StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
};
......
@@ -247,7 +247,7 @@ public:
virtual void restart() {}
/// Set the gradient of the output from outside.
virtual void setOutputGrad(const std::vector<Argument>& args) {
LOG(FATAL) << "Not implemented!";
}
......
@@ -31,14 +31,15 @@ typedef Queue<int> PidQueue;
typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
struct GradBuffer {
/// GradBuffer is used for gathering gradients for GPU parameters
int paramId;
/// sem is used to notify that the local gradient merge of the current thread
/// has finished.
Semaphore sem;
// bufs[mergeIndex]
std::vector<VectorPtr> bufs;
};
/** /**
@@ -189,14 +190,14 @@ public:
return useGpu_;
}
/// @return whether to pass the gradients in outArgs_ to each thread.
bool isPassGrad() { return isPassGrad_; }
/// @brief set whether to pass the gradient in outArgs_ to each thread.
void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
/// Set the gradients of the outputs.
/// The gradients will be copied to each thread in the computing threads.
virtual void setOutputGrad(const std::vector<Argument>& args);
protected:
@@ -205,8 +206,8 @@ protected:
std::vector<TrainerThreadPtr>& getAllThreads() {
return threads_;
}
/// Calculate the real device id based on the logical device id and the
/// thread id.
int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
if (logicalId == -1) {
logicalId = 0;
@@ -215,8 +216,8 @@ protected:
numDevices_);
}
/// Calculate the logical device id based on the real device id and the
/// thread id.
int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
if (realId == -1) {
return 0;
@@ -232,15 +233,15 @@ protected:
return hasNonstaticCpuParamters_;
}
/// Called by TrainerThread to wait before merging CPU parameter gradients.
void waitBeforeMerge() { trainerBarrier_.wait(); }
/// called by MultiGradientMachine and TrainerThread to wait after merging
/// CPU parameter gradients.
void waitAfterMerge() { allBarrier_.wait(); }
/// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
/// to finish
void waitForCopyInArgs() { allBarrier_.wait(); }
TrainerThreadPtr& getThread(int threadId) {
@@ -255,8 +256,8 @@ protected:
return passType_;
}
/// Called by TrainerThread to notify MultiGradientMachine that the gradient
/// for paramId is ready
void notifyGradientTransfer(int paramId);
const std::vector<Argument>& getInArgs() {
@@ -297,7 +298,7 @@ protected:
virtual void backwardImp(
const UpdateCallback& callback = NULL);
/// update all parameters
void updateThreadParameters();
void startTask(TaskType taskType);
@@ -311,7 +312,7 @@ protected:
bool hasNonstaticCpuParamters_;
/// store main parameter only
std::unique_ptr<GradientMachine> gradientMachine_;
std::vector<TrainerThreadPtr> threads_;
@@ -326,7 +327,7 @@ protected:
std::vector<Argument> outArgs_;
hl_stream_t outArgStream_;
/// ParameterType which needs to be merged from each GPU
std::vector<ParameterType> mergeTypes_;
int numDevices_; /* number of gpu devices */
int numLogicalDevices_; // number of GPU used by one NN
@@ -334,16 +335,16 @@ protected:
UpdateCallback backwardCallback_;
/// barrier for threads_
ThreadBarrier trainerBarrier_;
/// barrier for both MultiGradientMachine and threads_
ThreadBarrier allBarrier_;
/// indicate whether inArgs is copied before forward()
bool inArgsCopied_;
/// Whether to copy the gradient back from an external input.
bool isPassGrad_;
};
@@ -413,7 +414,7 @@ public:
void prefetch();
/// copy the output gradient from the main GradientMachine.
void copyOutputGrad();
protected:
@@ -441,51 +442,60 @@ protected:
void backward();
void backwardCallback(Parameter* para);
/// call the actual callback supplied by the caller of
/// GradientMachine::backward
void doCallback(int pid);
protected:
MultiGradientMachine* multiMachine_;
ModelConfig config_;
/// whether the thread should stop
bool stopping_;
/// the thread from which to collect gradient
int partnerId_;
/// from 0 to the number of threads - 1
int threadId_;
int deviceId_;
std::unique_ptr<GradientMachine> gradientMachine_;
std::vector<ParameterPtr> parameters_;
/// ParameterType which needs to be merged from each GPU
std::vector<ParameterType> mergeTypes_;
/// compute thread
std::unique_ptr<std::thread> computeThread_;
std::vector<Argument> inArgs_;
std::vector<Argument> outArgs_;
Semaphore taskReadySem_;
Semaphore outArgsReadySem_;
/// copy thread
std::unique_ptr<std::thread> copyThread_;
/// queue of gradients that need to be copied to the partner
PidQueue gradBufQueue_;
hl_stream_t gradStream_;
/// grad merge thread
std::unique_ptr<std::thread> gradCollectThread_;
/// queue of gradients that need to be merged with gradients copied by
/// copyGradToBufferThread
PidQueue gradQueue_;
UpdateCallback backwardCallback_;
/// value dispatch thread
std::unique_ptr<std::thread> valueDispatchThread_;
/// queue of the parameters whose values are ready for copy
PidQueue valueReadyQueue_;
/// used to notify that all the parameter values are ready
LockedCondition valueReadyCond_;
hl_stream_t valueStream_;
/// how many parameters are updated
std::atomic<int> updateCounter_;
bool parameterUpdated_;
/// indicate whether inArgs is copied before forward()
bool inArgsCopied_;
};
......
@@ -66,12 +66,15 @@ public:
PARAMETER_MOMENTUM},
bool useGpu = FLAGS_use_gpu);
/**
 * Connect two submodels:
 * the down-submodel's output becomes the up-submodel's input.
 * By default, the connection is one by one;
 * if the agent height is smaller than the real layer, *height* has to be filled.
 *
 * @param realLayer The down-submodel's output layer.
 * @param agentLayer The up-submodel's input agent layer.
 */
static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
void connect(std::string agentLayerName, NeuralNetwork* srcNN,
std::string realLayerName);
@@ -98,10 +101,10 @@ public:
virtual void resetState();
virtual void setOutputGrad(const std::vector<Argument>& args);
/// set machine state
virtual void setState(const MachineState& machineState);
/// get machine state
virtual void getState(MachineState& machineState);
static NeuralNetwork* create(const ModelConfig& config);
@@ -126,8 +129,14 @@ public:
NeuralNetwork* rootNetwork = nullptr);
protected:
/**
 * The constructor of NeuralNetwork.
 * Sub networks can get parameters_ and parameterMap_
 * from the base NeuralNetwork.
 *
 * @param subModelName The name of the sub-model.
 * @param rootNetwork It is used in MultiNetwork.
 */
NeuralNetwork(std::string subModelName = "",
NeuralNetwork* rootNetwork = nullptr)
: subModelName_(subModelName),
@@ -146,8 +155,8 @@ protected:
NeuralNetwork* rootNetwork_;
/// Whether the parameters of this NN are initialized by itself
/// (i.e., not by the callback supplied by the caller)
bool paramSelfInited_;
};
......
@@ -67,7 +67,8 @@ public:
protected:
bool useGpu_;
/// number of gpu devices
int numDevices_;
std::vector<std::unique_ptr<ParallelThread>> threads_;
};
@@ -97,11 +98,14 @@ public:
JobQueue queue_;
protected:
/// from 0 to the number of threads - 1
int threadId_;
/// the GPU device id used by computeThread_
int deviceId_;
bool useGpu_;
std::unique_ptr<std::thread> computeThread_;
/// whether the thread should stop
bool stopping_;
UpdateCallback backwardCallback_;
PassType passType_;
};
......
@@ -53,8 +53,13 @@ protected:
real coeff_;
};
/**
 * The cross-entropy loss for multi-class classification tasks.
 * The loss function is:
 *
 * \f[
 * L = - \sum_{k}{t_{k} * log(P(y=k))}
 * \f]
 */
class MultiClassCrossEntropy : public CostLayer {
public:
@@ -68,9 +73,20 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
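As a quick sanity check of the formula above, here is a small standalone C++ snippet (independent of Paddle's Matrix/Argument types; the helper name and toy values are illustrative only) that evaluates the per-sample cross-entropy for softmax outputs and integer labels:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Per-sample loss L = -sum_k t_k * log(P(y = k)); with a one-hot target this
// reduces to -log of the probability assigned to the true class.
double crossEntropy(const std::vector<double>& prob, int label) {
  return -std::log(prob[label]);
}

int main() {
  // Softmax outputs for two toy samples over three classes.
  std::vector<std::vector<double>> prob = {{0.7, 0.2, 0.1}, {0.1, 0.1, 0.8}};
  std::vector<int> label = {0, 2};
  for (size_t i = 0; i < prob.size(); ++i) {
    std::printf("sample %zu: loss = %f\n", i, crossEntropy(prob[i], label[i]));
  }
  return 0;  // expected: 0.356675 and 0.223144
}
```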
/**
 * The cross-entropy with self-normalization for multi-class classification.
 *
 * The loss function is:
 * \f[
 * L = \sum_{i}[-log(P(x_{i})) + \alpha * log(Z(x_{i})^2)]
 * \f]
 *
 * \f$Z(x)\f$ is the softmax normalizer.
 *
 * [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
 * Richard Schwartz, and John Makhoul. Fast and robust neural
 * network joint models for statistical machine translation.
 * In Proceedings of the ACL 2014 Conference.
 */
class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
public:
@@ -88,9 +104,11 @@ protected:
MatrixPtr sumInv_;
};
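To make the self-normalization term concrete, the standalone sketch below evaluates the per-sample loss from unnormalized scores; Z is the softmax partition function and alpha weights the log(Z)^2 penalty. The names and values are assumptions for illustration, not Paddle's API:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Per-sample loss: -log(P(label)) + alpha * log(Z)^2, where Z = sum_k exp(s_k).
// The penalty pushes log(Z) towards 0, so raw scores can be used as
// approximate log-probabilities without normalization at decode time.
double selfNormalizedCrossEntropy(const std::vector<double>& score, int label,
                                  double alpha) {
  double z = 0.0;
  for (double s : score) z += std::exp(s);
  double logProb = score[label] - std::log(z);
  return -logProb + alpha * std::log(z) * std::log(z);
}

int main() {
  std::vector<double> score = {2.0, 0.5, -1.0};  // unnormalized class scores
  std::printf("loss = %f\n", selfNormalizedCrossEntropy(score, 0, 0.1));
  return 0;
}
```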
/**
 * The cross-entropy for soft binary class.
 * \f[
 * L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
 * \f]
 */
class SoftBinaryClassCrossEntropy : public CostLayer {
public:
@@ -107,6 +125,13 @@ protected:
MatrixPtr targetPerDim_;
};
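A minimal standalone check of the formula above for one sample, treating every output dimension as an independent binary probability with a soft (real-valued) target; the helper name is hypothetical:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// L = sum_j [ -y_j*log(x_j) - (1-y_j)*log(1-x_j) ], where x_j is a predicted
// probability in (0, 1) and y_j a soft target in [0, 1].
double softBinaryCrossEntropy(const std::vector<double>& x,
                              const std::vector<double>& y) {
  double loss = 0.0;
  for (size_t j = 0; j < x.size(); ++j) {
    loss += -y[j] * std::log(x[j]) - (1.0 - y[j]) * std::log(1.0 - x[j]);
  }
  return loss;
}

int main() {
  std::vector<double> x = {0.9, 0.2, 0.6};  // predicted probabilities
  std::vector<double> y = {1.0, 0.0, 0.5};  // soft labels
  std::printf("loss = %f\n", softBinaryCrossEntropy(x, y));
  return 0;
}
```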
/**
 * This cost layer computes the Euclidean (L2) loss for real-valued regression
* tasks.
* \f[
* L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
* \f]
*/
class SumOfSquaresCostLayer : public CostLayer {
public:
explicit SumOfSquaresCostLayer(const LayerConfig& config)
@@ -119,8 +144,17 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
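A small standalone numeric check of the L2 loss as written above; the 1/(2N) scaling follows the comment's formula and is an assumption about the exact scaling, not a statement about the layer's internals:

```cpp
#include <cstdio>
#include <vector>

// L = 1/(2N) * sum_i || y_hat_i - y_i ||_2^2 over a batch of N samples.
double sumOfSquaresLoss(const std::vector<std::vector<double>>& pred,
                        const std::vector<std::vector<double>>& target) {
  double loss = 0.0;
  for (size_t i = 0; i < pred.size(); ++i) {
    for (size_t d = 0; d < pred[i].size(); ++d) {
      double diff = pred[i][d] - target[i][d];
      loss += diff * diff;
    }
  }
  return loss / (2.0 * pred.size());
}

int main() {
  std::vector<std::vector<double>> pred = {{1.0, 2.0}, {0.5, -1.0}};
  std::vector<std::vector<double>> target = {{1.5, 2.0}, {0.0, -2.0}};
  std::printf("loss = %f\n", sumOfSquaresLoss(pred, target));
  // -> (0.25 + 0 + 0.25 + 1.0) / (2 * 2) = 0.375
  return 0;
}
```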
/**
 * A cost layer for the learning to rank (LTR) task. This layer contains at least
 * three inputs.
 * \f[
 * C_{i,j} = -\tilde{P_{i,j}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
 * o_{i,j} = o_i - o_j \\
 * \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
 * \f]
 *
 * [1] Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
 * Rank using Gradient Descent.
 */
class RankingCost : public Layer {
public:
@@ -155,12 +189,25 @@ private:
double negPairCount_;
MatrixPtr margin_;
MatrixPtr marginGrad_;
/// if input label is put in ids (not value), copy to this buffer.
MatrixPtr labelBuf_;
LayerPtr weightLayer_;
};
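The pairwise cost can be checked in isolation: given two scores o_i and o_j and a target probability in {0, 0.5, 1} (how likely item i should rank above item j), the sketch below evaluates C_{i,j}. It illustrates the math only, not the layer's forward pass, and all names and values are illustrative:

```cpp
#include <cmath>
#include <cstdio>

// C_{i,j} = -P_target * o_{i,j} + log(1 + exp(o_{i,j})), with o_{i,j} = o_i - o_j.
// P_target is 1 if item i should rank above item j, 0 if below, 0.5 if tied.
double rankingCost(double oi, double oj, double pTarget) {
  double o = oi - oj;
  return -pTarget * o + std::log(1.0 + std::exp(o));
}

int main() {
  // Item i scores higher than item j and the label agrees -> small cost.
  std::printf("correct order: %f\n", rankingCost(2.0, 0.5, 1.0));
  // Same scores, but the label says j should rank above i -> larger cost.
  std::printf("wrong order:   %f\n", rankingCost(2.0, 0.5, 0.0));
  return 0;
}
```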
/**
 * LambdaRank is a method for learning arbitrary information retrieval
* measures. It can be applied to any algorithm that learns through gradient
* descent. LambdaRank is a listwise method, in that the cost depends on the
 * sorted order of the documents. LambdaRank gives the gradient of the cost
* function:
*
* \f[
* \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
* \f]
*
* [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
* with Nonsmooth Cost Functions.
*/
class LambdaCost : public Layer {
public:
explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
@@ -191,9 +238,11 @@ private:
};
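To make the lambda expression concrete, the sketch below computes lambda_{ij} for one document pair, taking |Delta_NDCG| as the change in NDCG caused by swapping the two documents in the current ranking (only the two swapped positions change the DCG sum). The relevance labels, scores, and gain/discount choices are assumptions for illustration:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Gain of a document with relevance rel placed at rank pos (1-based).
double dcgTerm(double rel, int pos) {
  return (std::pow(2.0, rel) - 1.0) / std::log2(pos + 1.0);
}

// |Delta NDCG| when the documents at ranks posI and posJ swap places,
// normalized by the DCG of the ideal (descending-relevance) ordering.
double deltaNdcg(const std::vector<double>& rel, int posI, int posJ,
                 double idealDcg) {
  double before = dcgTerm(rel[posI - 1], posI) + dcgTerm(rel[posJ - 1], posJ);
  double after = dcgTerm(rel[posI - 1], posJ) + dcgTerm(rel[posJ - 1], posI);
  return std::fabs(after - before) / idealDcg;
}

int main() {
  // Relevance labels of the documents currently at ranks 1..3.
  std::vector<double> rel = {1.0, 3.0, 0.0};
  // The ideal ordering would place relevance 3 first, then 1, then 0.
  double idealDcg = dcgTerm(3.0, 1) + dcgTerm(1.0, 2) + dcgTerm(0.0, 3);
  double oi = 1.2, oj = 2.0;  // model scores of the documents at ranks 1 and 2
  double lambda =
      1.0 / (1.0 + std::exp(oi - oj)) * deltaNdcg(rel, 1, 2, idealDcg);
  std::printf("lambda_ij = %f\n", lambda);
  return 0;
}
```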
/**
 * Cross entropy for multi binary labels.
 * \f[
 * cost[i] = -sum(label[i][j]*log(output[i][j]) +
 * (1-label[i][j])*log(1-output[i][j]))
 * \f]
 */
class MultiBinaryLabelCrossEntropy : public CostLayer {
protected:
@@ -210,13 +259,18 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
/**
 * Huber loss for robust two-class classification.
 *
 * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
 * \f[
 * Loss =
 * \left\{\begin{matrix}
 * -4 * y * f & \textit{if} \ \ y * f < -1 \\
 * (1 - y * f)^2 & \textit{if} \ \ -1 < y * f < 1 \\
 * 0 & \textit{otherwise}
 * \end{matrix}\right.
 * \f]
 */
class HuberTwoClass : public CostLayer {
std::vector<Argument> tmpCpuInput_;
......
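A standalone check of the piecewise Huber definition above, mapping label in {0, 1} to y = 2*label - 1; the helper name and sample outputs are illustrative only:

```cpp
#include <cstdio>

// Huber two-class loss: y = 2*label - 1, f is the model output.
// -4*y*f when y*f < -1, (1 - y*f)^2 when -1 <= y*f < 1, and 0 otherwise.
double huberTwoClassLoss(double f, int label) {
  double y = 2.0 * label - 1.0;
  double yf = y * f;
  if (yf < -1.0) return -4.0 * yf;
  if (yf < 1.0) return (1.0 - yf) * (1.0 - yf);
  return 0.0;
}

int main() {
  std::printf("confident & correct: %f\n", huberTwoClassLoss(2.0, 1));   // 0
  std::printf("borderline:          %f\n", huberTwoClassLoss(0.5, 1));   // 0.25
  std::printf("confident & wrong:   %f\n", huberTwoClassLoss(-2.0, 1));  // 8
  return 0;
}
```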
@@ -3083,7 +3083,7 @@ following are cost Layers.
@wrap_name_default()
def rank_cost(left, right, lable, weight=None, name=None, coeff=1.0):
"""
A cost Layer for learning to rank using gradient descent. Details can refer
to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_.
This layer contains at least three inputs. The weight is an optional
argument, which affects the cost.
......