Commit f0637523 authored by D dangqingqing

Adjustment doc and code for CostLayer, GradientMachine and DataProvider.

Also add some comments for cost layers.
ISSUE=4580653

git-svn-id: https://svn.baidu.com/idl/trunk/paddle@1410 1ad973e4-5ce8-4261-8a94-b56d1f490c56
Parent 4268885c
Activations
=============
.. doxygenfile:: paddle/gserver/activations/ActivationFunction.h
.. doxygenfile:: paddle/gserver/activations/ActivationFunction.cpp
Data Providers
================
Data Provider
Base DataProvider
------------------
.. doxygenclass:: paddle::DataProvider
:members:
DataProviderGroup
-------------------
.. doxygenclass:: paddle::DataProviderGroup
:members:
MultiDataProvider
-------------------
.. doxygenclass:: paddle::MultiDataProvider
:members:
PyDataProvider
===================
IFieldScanner
-------------
.. doxygenclass:: paddle::IFieldScanner
:members:
DenseScanner
-------------
.. doxygenclass:: paddle::DenseScanner
:members:
IndexScanner
-------------
.. doxygenclass:: paddle::IndexScanner
:members:
SparseNonValueScanner
---------------------
.. doxygenclass:: paddle::SparseNonValueScanner
:members:
SparseValueScanner
------------------
.. doxygenclass:: paddle::SparseValueScanner
:members:
SequenceScanner
------------------
.. doxygenclass:: paddle::SequenceScanner
:members:
IPyDataProviderCache
--------------------
.. doxygenclass:: paddle::IPyDataProviderCache
:members:
NoCacheStrategy
---------------
.. doxygenfile:: paddle/gserver/dataproviders/DataProvider.h
.. doxygenfile:: paddle/gserver/dataproviders/PyDataProvider2.cpp
.. doxygenfile:: paddle/gserver/dataproviders/DataProviderGroup.h
.. doxygenfile:: paddle/gserver/dataproviders/MultiDataProvider.h
.. doxygenclass:: paddle::NoCacheStrategy
:members:
Proto Data Provider
CacheOnePassInMemory
--------------------
.. doxygenfile:: paddle/gserver/dataproviders/ProtoDataProvider.h
.. doxygenfile:: paddle/gserver/dataproviders/ProtoReader.h
.. doxygenclass:: paddle::CacheOnePassInMemory
:members:
PyDataProvider2
---------------
.. doxygenclass:: paddle::PyDataProvider2
:members:
Proto Data Provider
===================
ProtoDataProvider
--------------------
.. doxygenclass:: paddle::ProtoDataProvider
:members:
ProtoSequenceDataProvider
---------------------------
.. doxygenclass:: paddle::ProtoSequenceDataProvider
:members:
Base Evaluator
==============
Evaluator
---------
.. doxygenclass:: paddle::Evaluator
:members:
Utils
=====
SumEvaluator
------------
.. doxygenclass:: paddle::SumEvaluator
:members:
ColumnSumEvaluator
------------------
.. doxygenclass:: paddle::ColumnSumEvaluator
:members:
Classification
==============
ClassificationErrorEvaluator
------------------------------
.. doxygenclass:: paddle::ClassificationErrorEvaluator
:members:
SequenceClassificationErrorEvaluator
------------------------------------
.. doxygenclass:: paddle::SequenceClassificationErrorEvaluator
:members:
AucEvaluator
-------------
.. doxygenclass:: paddle::AucEvaluator
:members:
PrecisionRecallEvaluator
------------------------
.. doxygenclass:: paddle::PrecisionRecallEvaluator
:members:
ChunkEvaluator
--------------
.. doxygenclass:: paddle::ChunkEvaluator
:members:
CTCEvaluator
------------
.. doxygenclass:: paddle::CTCErrorEvaluator
:members:
Rank
====
PnpairEvaluator
-----------------
.. doxygenclass:: paddle::PnpairEvaluator
:members:
RankAucEvaluator
------------------
.. doxygenclass:: paddle::RankAucEvaluator
:members:
Printer
=======
ValuePrinter
-------------
.. doxygenclass:: paddle::ValuePrinter
:members:
GradientPrinter
---------------
.. doxygenclass:: paddle::GradientPrinter
:members:
MaxIdPrinter
------------
.. doxygenclass:: paddle::MaxIdPrinter
:members:
MaxFramePrinter
---------------
.. doxygenclass:: paddle::MaxFramePrinter
:members:
SequenceTextPrinter
---------------------
.. doxygenclass:: paddle::SequenceTextPrinter
:members:
ClassificationErrorPrinter
--------------------------
.. doxygenclass:: paddle::ClassificationErrorPrinter
:members:
Evaluators
============
.. doxygenfile:: paddle/gserver/evaluators/Evaluator.h
.. doxygenfile:: paddle/gserver/evaluators/ChunkEvaluator.cpp
.. doxygenfile:: paddle/gserver/evaluators/CTCErrorEvaluator.cpp
==========
.. toctree::
:maxdepth: 3
evaluators.rst
Gradient machines
===================
Gradient Machines
===================
Networks
------------
.. doxygenfile:: paddle/gserver/gradientmachines/MultiNetwork.h
.. doxygenfile:: paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
GradientMachine
---------------------
.. doxygenclass:: paddle::GradientMachine
:members:
Gradient Machines
GradientMachineModel
--------------------
.. doxygenfile:: paddle/gserver/gradientmachines/GradientMachine.h
.. doxygenfile:: paddle/gserver/gradientmachines/MultiGradientMachine.h
.. doxygenclass:: paddle::IGradientMachineMode
:members:
MultiGradientMachine
---------------------
.. doxygenclass:: paddle::MultiGradientMachine
:members:
TrainerThread
`````````````
.. doxygenclass:: paddle::TrainerThread
:members:
Recurrent Gradient Machines
-----------------------------
.. doxygenfile:: paddle/gserver/gradientmachines/RecurrentGradientMachine.h
.. doxygenfile:: paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
---------------------------
.. doxygenclass:: paddle::RecurrentGradientMachine
:members:
Networks
========
NeuralNetwork
-------------
.. doxygenclass:: paddle::NeuralNetwork
:members:
ParallelNeuralNetwork
---------------------
.. doxygenclass:: paddle::ParallelNeuralNetwork
:members:
......@@ -118,10 +118,10 @@ public:
data_.push_back(argu);
}
/*
* argus: DataBatch.getStreams()
* size: DataBatch.getSize()
* dataId: sub dataprovider id (in MultiDataProvider)
/**
* @param argus: DataBatch.getStreams()
* @param size: DataBatch.getSize()
* @param dataId: sub dataprovider id (in MultiDataProvider)
*/
void appendArguments(const std::vector<Argument>& argus, int size,
int dataId) {
......@@ -312,22 +312,28 @@ public:
}
};
// Data provider for one input and one integer label
/**
* Data provider for one input and one integer label.
*/
class SimpleDataProviderBase : public DataProvider {
protected:
int64_t sampleDim_; // sample feature dimension
int64_t bufferCapacity_; // the number of samples
/// sample feature dimension
int64_t sampleDim_;
/// the number of samples
int64_t bufferCapacity_;
int64_t sampleNumInBuf_;
int64_t nextItemIndex_; // next item to read in buffer
bool withInfo_; // some user defined info for validation
/// next item to read in buffer
int64_t nextItemIndex_;
/// some user defined info for validation
bool withInfo_;
// data buffer: bufferCapacity_ * nDataDim_
/// data buffer: bufferCapacity_ * nDataDim_
CpuMatrixPtr hInputDataBuf_;
// label buffer:bufferCapacity_ * 1
/// label buffer: bufferCapacity_ * 1
CpuIVectorPtr hInputLabelBuf_;
// info buffer:bufferCapacity_ * 1
/// info buffer: bufferCapacity_ * 1
CpuIVectorPtr hInputInfoBuf_;
ThreadLocal<MatrixPtr> dataBatch_;
......@@ -348,7 +354,7 @@ public:
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
// return the number of samples in the buffer
/// return the number of samples in the buffer
int64_t fillBuffer();
protected:
......
......@@ -80,7 +80,7 @@ protected:
*/
inline bool iidData() const { return sequenceStartPositions_.empty(); }
// check that sample is consistent with header_
/// check that sample is consistent with header_
void checkSample(const DataSample& sample);
template <class Op>
......@@ -129,14 +129,15 @@ protected:
int64_t currentSequenceIndex_;
// The size should be the number of sequences.
/// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
std::vector<StatPtr> nnzStats_; // stats for number of none-zeros entries
/// stats for the number of non-zero entries
std::vector<StatPtr> nnzStats_;
};
/**
......
......@@ -1000,20 +1000,34 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
/**
* Sequence text printer will print text according to index matrix and a
* dictionary. There can be multiple inputs to this layer:
*
* 1) If there is only one input, the input must be a matrix containing
* the sequence of indices;
*
* 2) If there is more than one input, the first input should be ids,
* and is interpreted as sample ids.
*
* The output format will be:
*
* 1) sequence without sub-sequence, and there is probability.
*
* @code
* id \t prob space_separated_tokens_from_dictionary_according_to_seq
* @endcode
*
* 2) sequence without sub-sequence, and there is no probability.
*
* @code
* id \t space_separated_tokens_from_dictionary_according_to_seq
* @endcode
*
* 3) sequence with sub-sequence, and there is no probability.
*
* @code
* id \t space_separated_tokens_from_dictionary_according_to_sub_seq
* \t \t space_separated_tokens_from_dictionary_according_to_sub_seq
* ...
* @endcode
*
* Typically SequenceTextPrinter layer takes output of maxid or RecurrentGroup
* with maxid (when generating) as an input.
......
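For illustration, a minimal Python sketch (hypothetical dictionary and indices, not part of the PaddlePaddle API) of what output format 1) above looks like for a single sequence:

# Illustrative only: produce one line in format "id \t prob tokens..."
dictionary = ["<s>", "hello", "world", "</s>"]   # assumed token dictionary
sample_id = 7
prob = 0.83
index_seq = [0, 1, 2, 3]                         # one row of the index matrix

tokens = " ".join(dictionary[i] for i in index_seq)
print("%d\t%f %s" % (sample_id, prob, tokens))
# 7       0.830000 <s> hello world </s>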
......@@ -68,7 +68,7 @@ public:
numSamples_ += arguments[0].getBatchSize();
}
// finish() should be called before distributeEval
/// finish() should be called before distributeEval
virtual void distributeEval(ParameterClient2* client) {
LOG(FATAL) << "Not implemeted";
}
......@@ -85,7 +85,7 @@ public:
*/
virtual void finish() {}
// finish() should be called before printStats
/// finish() should be called before printStats
virtual void printStats(std::ostream& os) {
os << config_.name() << "="
<< (numSamples_ ? totalScore_ / numSamples_ : 0);
......@@ -130,9 +130,9 @@ public:
/**
* @brief evaluate AUC using colIdx-th column as prediction.
*
* colIdx = 0: the 0-th column.
* colIdx > 0: the colIdx-th column.
* colIdx < 0: the last colIdx-th column.
* - colIdx = 0: the 0-th column.
* - colIdx > 0: the colIdx-th column.
* - colIdx < 0: the last colIdx-th column.
*
*/
AucEvaluator(int32_t colIdx)
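A small sketch (plain Python, separate from the C++ class) of how the colIdx convention above can be resolved to a concrete column index:

def resolve_col_idx(col_idx, num_cols):
    # col_idx = 0 -> column 0; col_idx > 0 -> that column;
    # col_idx < 0 -> counted from the end (e.g. -1 is the last column).
    if col_idx >= 0:
        return col_idx
    return num_cols + col_idx

assert resolve_col_idx(0, 3) == 0
assert resolve_col_idx(2, 3) == 2
assert resolve_col_idx(-1, 3) == 2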
......@@ -223,10 +223,14 @@ public:
virtual void distributeEval(ParameterClient2* client);
struct StatsInfo {
double TP; // numbers of true positives
double TN; // numbers of true negatives
double FP; // numbers of false positives
double FN; // numbers of false negatives
/// number of true positives
double TP;
/// number of true negatives
double TN;
/// number of false positives
double FP;
/// number of false negatives
double FN;
StatsInfo() : TP(0.0), TN(0.0), FP(0.0), FN(0.0) {}
};
......
......@@ -247,7 +247,7 @@ public:
virtual void restart() {}
// Set the gradient of the output from outside.
/// Set the gradient of the output from outside.
virtual void setOutputGrad(const std::vector<Argument>& args) {
LOG(FATAL) << "Not implemented!";
}
......
......@@ -31,14 +31,15 @@ typedef Queue<int> PidQueue;
typedef std::unique_ptr<TrainerThread> TrainerThreadPtr;
struct GradBuffer {
// GradBuffer is used for gathering gradient for GPU parameters
/// GradBuffer is used for gathering gradient for GPU parameters
int paramId;
// sem is used to notify that the local gradient merge of the current thread
// finished for the current thread.
/// sem is used to notify that the local gradient merge for the current
/// thread has finished.
Semaphore sem;
std::vector<VectorPtr> bufs; // bufs[mergeIndex]
/// bufs[mergeIndex]
std::vector<VectorPtr> bufs;
};
/**
......@@ -189,14 +190,14 @@ public:
return useGpu_;
}
// @return whether to pass the gradients in outArgs_ to each threads.
/// @return whether to pass the gradients in outArgs_ to each thread.
bool isPassGrad() { return isPassGrad_; }
// @brief set whether to pass the gradient in outArgs_ to each threads.
/// @brief set whether to pass the gradient in outArgs_ to each thread.
void setPassGrad(bool isPass) { isPassGrad_ = isPass; }
// Set the gradients of the outputs.
// The gradietns will be copied to each thread in the computing threads.
/// Set the gradients of the outputs.
/// The gradients will be copied to each of the computing threads.
virtual void setOutputGrad(const std::vector<Argument>& args);
protected:
......@@ -205,8 +206,8 @@ protected:
std::vector<TrainerThreadPtr>& getAllThreads() {
return threads_;
}
// Calculate the real device id based on the logical device id and the
// thread id.
/// Calculate the real device id based on the logical device id and the
/// thread id.
int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const {
if (logicalId == -1) {
logicalId = 0;
......@@ -215,8 +216,8 @@ protected:
numDevices_);
}
// Calculate the logical device id based on the real device id and the
// thread id.
/// Calculate the logical device id based on the real device id and the
/// thread id.
int realDeviceId2LogicalDeviceId(int realId, int threadId = 0) const {
if (realId == -1) {
return 0;
......@@ -232,15 +233,15 @@ protected:
return hasNonstaticCpuParamters_;
}
// Called TrainerThread to wait before merging CPU parameter gradients.
/// Called by TrainerThread to wait before merging CPU parameter gradients.
void waitBeforeMerge() { trainerBarrier_.wait(); }
// called by MultiGradientMachine and TrainerThread to wait after merging
// CPU parameter graidents.
/// Called by MultiGradientMachine and TrainerThread to wait after merging
/// CPU parameter gradients.
void waitAfterMerge() { allBarrier_.wait(); }
// called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
// finishing
/// Called by MultiGradientMachine and TrainerThread to wait for copyInArgs()
/// to finish
void waitForCopyInArgs() { allBarrier_.wait(); }
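A toy Python sketch (threading.Barrier only, unrelated to the real C++ synchronization code) of the two-barrier hand-off described above: trainer threads synchronize among themselves before merging, and the coordinator joins them on a second barrier after merging:

import threading

NUM_TRAINERS = 4
trainer_barrier = threading.Barrier(NUM_TRAINERS)      # trainers only (waitBeforeMerge)
all_barrier = threading.Barrier(NUM_TRAINERS + 1)      # trainers + coordinator (waitAfterMerge)

def trainer(tid):
    # ... compute local CPU parameter gradients ...
    trainer_barrier.wait()   # all trainers are ready to merge
    # ... merge CPU parameter gradients ...
    all_barrier.wait()       # let the coordinator continue

threads = [threading.Thread(target=trainer, args=(i,)) for i in range(NUM_TRAINERS)]
for t in threads:
    t.start()
all_barrier.wait()           # coordinator waits for the merge to finish
for t in threads:
    t.join()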
TrainerThreadPtr& getThread(int threadId) {
......@@ -255,8 +256,8 @@ protected:
return passType_;
}
// Called by TrainerThread to notify MultiGradientMachine that the gradient
// for paramId is ready
/// Called by TrainerThread to notify MultiGradientMachine that the gradient
/// for paramId is ready
void notifyGradientTransfer(int paramId);
const std::vector<Argument>& getInArgs() {
......@@ -297,7 +298,7 @@ protected:
virtual void backwardImp(
const UpdateCallback& callback = NULL);
// update all parameters
/// update all parameters
void updateThreadParameters();
void startTask(TaskType taskType);
......@@ -311,7 +312,7 @@ protected:
bool hasNonstaticCpuParamters_;
// store main parameter only
/// store main parameter only
std::unique_ptr<GradientMachine> gradientMachine_;
std::vector<TrainerThreadPtr> threads_;
......@@ -326,7 +327,7 @@ protected:
std::vector<Argument> outArgs_;
hl_stream_t outArgStream_;
// ParameterType which needs to be merged from each GPU
/// ParameterType which needs to be merged from each GPU
std::vector<ParameterType> mergeTypes_;
int numDevices_; /* number of gpu devices */
int numLogicalDevices_; // number of GPU used by one NN
......@@ -334,16 +335,16 @@ protected:
UpdateCallback backwardCallback_;
// barrrier for threads_
/// barrier for threads_
ThreadBarrier trainerBarrier_;
// barrier for both MultiGradientMachine and threds_
/// barrier for both MultiGradientMachine and threads_
ThreadBarrier allBarrier_;
// indicate whether inArgs is copied before forward()
/// indicate whether inArgs is copied before forward()
bool inArgsCopied_;
// Whether to copy the gradient back from an external input.
/// Whether to copy the gradient back from an external input.
bool isPassGrad_;
};
......@@ -413,7 +414,7 @@ public:
void prefetch();
// copy the output gradient from the main GradientMachine.
/// copy the output gradient from the main GradientMachine.
void copyOutputGrad();
protected:
......@@ -441,51 +442,60 @@ protected:
void backward();
void backwardCallback(Parameter* para);
// call the actuall callback supplied by the caller of
// GradientMachine::backward
/// call the actual callback supplied by the caller of
/// GradientMachine::backward
void doCallback(int pid);
protected:
MultiGradientMachine* multiMachine_;
ModelConfig config_;
bool stopping_; // whether the thread should stop
int partnerId_; // the threads form which to collect gradient
int threadId_; // from 0 to #threads-1
/// whether the thread should stop
bool stopping_;
/// the thread from which to collect gradients
int partnerId_;
/// from 0 to the number of threads - 1
int threadId_;
int deviceId_;
std::unique_ptr<GradientMachine> gradientMachine_;
std::vector<ParameterPtr> parameters_;
// ParameterType which needs to be merged from each GPU
/// ParameterType which needs to be merged from each GPU
std::vector<ParameterType> mergeTypes_;
std::unique_ptr<std::thread> computeThread_; // compute thread
/// compute thread
std::unique_ptr<std::thread> computeThread_;
std::vector<Argument> inArgs_;
std::vector<Argument> outArgs_;
Semaphore taskReadySem_;
Semaphore outArgsReadySem_;
std::unique_ptr<std::thread> copyThread_; // copy thread
PidQueue gradBufQueue_; // queue of gradient needs to be copied to partner
/// copy thread
std::unique_ptr<std::thread> copyThread_;
/// queue of gradients that need to be copied to the partner thread
PidQueue gradBufQueue_;
hl_stream_t gradStream_;
std::unique_ptr<std::thread> gradCollectThread_; // grad merge thread
// queue of gradient needs to be merged with gradient coopied by
// copyGradToBufferThread
/// grad merge thread
std::unique_ptr<std::thread> gradCollectThread_;
/// queue of gradients that need to be merged with the gradients copied by
/// copyGradToBufferThread
PidQueue gradQueue_;
UpdateCallback backwardCallback_;
std::unique_ptr<std::thread> valueDispatchThread_; // value dispatch thread
// queue of the parameter whose the vale are ready for copy
/// value dispatch thread
std::unique_ptr<std::thread> valueDispatchThread_;
/// queue of the parameters whose values are ready for copy
PidQueue valueReadyQueue_;
// used to notify all the parameter values are ready
/// used to notify that all the parameter values are ready
LockedCondition valueReadyCond_;
hl_stream_t valueStream_;
std::atomic<int> updateCounter_; // how many parameters are updated
/// how many parameters are updated
std::atomic<int> updateCounter_;
bool parameterUpdated_;
// indicate whether inArgs is copied before forward()
/// indicate whether inArgs is copied before forward()
bool inArgsCopied_;
};
......
......@@ -66,12 +66,15 @@ public:
PARAMETER_MOMENTUM},
bool useGpu = FLAGS_use_gpu);
// connect two submodels
// down-submodel's output become up-submodel's input
// *realLayer* is down-submodel's output layer
// *agentLayer* is up-submodel's input agent layer
// by default, connection is one by one,
// if the agent height is smaller than real layer, *height* has to be filled
/**
* Connect two submodels: the down-submodel's output becomes the
* up-submodel's input. By default, the connection is one by one;
* if the agent height is smaller than that of the real layer,
* *height* has to be specified.
*
* @param realLayer The down-submodel's output layer.
* @param agentLayer The up-submodel's input agent layer.
*/
static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0);
void connect(std::string agentLayerName, NeuralNetwork* srcNN,
std::string realLayerName);
......@@ -98,10 +101,10 @@ public:
virtual void resetState();
virtual void setOutputGrad(const std::vector<Argument>& args);
// set machine state
/// set machine state
virtual void setState(const MachineState& machineState);
// get machine state
/// get machine state
virtual void getState(MachineState& machineState);
static NeuralNetwork* create(const ModelConfig& config);
......@@ -126,8 +129,14 @@ public:
NeuralNetwork* rootNetwork = nullptr);
protected:
// rootNetwork: used in MultiNetwork
// sub networks can get parameters_ and parameterMap_ from base NeuralNetwork
/**
* The constructor of NeuralNetwork.
* Sub-networks can get parameters_ and parameterMap_
* from the base NeuralNetwork.
*
* @param subModelName The name of the sub-model.
* @param rootNetwork It is used in MultiNetwork.
*/
NeuralNetwork(std::string subModelName = "",
NeuralNetwork* rootNetwork = nullptr)
: subModelName_(subModelName),
......@@ -146,8 +155,8 @@ protected:
NeuralNetwork* rootNetwork_;
// Whether parameter of this NN is initialized by its own
// (i.e., not by callback supplied with the caller)
/// Whether the parameters of this NN are initialized by itself
/// (i.e., not by the callback supplied by the caller)
bool paramSelfInited_;
};
......
......@@ -67,7 +67,8 @@ public:
protected:
bool useGpu_;
int numDevices_; /* number of gpu devices */
/// number of gpu devices
int numDevices_;
std::vector<std::unique_ptr<ParallelThread>> threads_;
};
......@@ -97,11 +98,14 @@ public:
JobQueue queue_;
protected:
int threadId_; // from 0 to #threads-1
int deviceId_; // the GPU device Id which the computeThread_ used
/// from 0 to the number of threads - 1
int threadId_;
/// the GPU device id used by computeThread_
int deviceId_;
bool useGpu_;
std::unique_ptr<std::thread> computeThread_;
bool stopping_; // whether the thread should stop
/// whether the thread should stop
bool stopping_;
UpdateCallback backwardCallback_;
PassType passType_;
};
......
......@@ -53,8 +53,13 @@ protected:
real coeff_;
};
/*
* MultiClassCrossEntropy
/**
* The cross-entropy loss for multi-class classification task.
* The loss function is:
*
* \f[
* L = - \sum_{k}{t_{k} * log(P(y=k))}
* \f]
*/
class MultiClassCrossEntropy : public CostLayer {
public:
......@@ -68,9 +73,20 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
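As a numeric illustration of the formula above (a NumPy sketch, not the layer's actual implementation), with probs being softmax outputs and labels integer class ids:

import numpy as np

def multi_class_cross_entropy(probs, labels):
    # probs: (N, K) softmax outputs; labels: (N,) integer class ids.
    n = probs.shape[0]
    return -np.sum(np.log(probs[np.arange(n), labels]))

probs = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1]])
labels = np.array([0, 1])
print(multi_class_cross_entropy(probs, labels))   # -(log 0.7 + log 0.8)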
/*
* MultiClassCrossEntropyWithSelfNorm
* \sum_i (-log(x_label(i)) + alpha * log(Z(i)^2)
/**
* The cross-entropy with self-normalization for multi-class classification.
*
* The loss function is:
* \f[
* L = \sum_{i}[-log(P(x_{i})) + alpha * log(Z(x_{i})^2)]
* \f]
*
* The \f$Z(x)\f$ is the softmax normalizer.
*
* [1] Jacob Devlin, Rabih Zbib, Zhongqiang Huang, Thomas Lamar,
* Richard Schwartz, and John Makhoul. Fast and robust neural
* network joint models for statistical machine translation.
* In Proceedings of the ACL 2014 Conference.
*/
class MultiClassCrossEntropyWithSelfNorm : public CostLayer {
public:
......@@ -88,9 +104,11 @@ protected:
MatrixPtr sumInv_;
};
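A NumPy sketch of the self-normalized loss (illustrative only; it reads the penalty as the squared log of the softmax normalizer Z, following Devlin et al. 2014, and takes unnormalized scores as input):

import numpy as np

def self_norm_cross_entropy(scores, labels, alpha=0.1):
    # scores: (N, K) unnormalized scores; labels: (N,) integer class ids.
    z = np.sum(np.exp(scores), axis=1)                      # softmax normalizer Z(x)
    log_p = scores[np.arange(len(labels)), labels] - np.log(z)
    return np.sum(-log_p + alpha * np.log(z) ** 2)          # cross-entropy + self-norm penalty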
/*
* SoftBinaryClassCrossEntropy
* \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
/**
* The cross-entropy for soft binary class.
* \f[
* L = \sum_i (\sum_j -y_j(i)*log(x_j(i))-(1-y_j(i))*log(1-x_j(i)))
* \f]
*/
class SoftBinaryClassCrossEntropy : public CostLayer {
public:
......@@ -107,6 +125,13 @@ protected:
MatrixPtr targetPerDim_;
};
/**
* This cost layer computes the Euclidean (L2) loss for real-valued regression
* tasks.
* \f[
* L = \frac{1}{2N} \sum_{i=1}^N {|| \hat{y}_i - y_i||_2^2}
* \f]
*/
class SumOfSquaresCostLayer : public CostLayer {
public:
explicit SumOfSquaresCostLayer(const LayerConfig& config)
......@@ -119,8 +144,17 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
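A NumPy sketch of the Euclidean loss above (illustrative only, batch size N):

import numpy as np

def sum_of_squares_cost(pred, target):
    # pred, target: (N, D) arrays; L = 1/(2N) * sum_i ||pred_i - target_i||^2
    n = pred.shape[0]
    return np.sum((pred - target) ** 2) / (2.0 * n)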
/*
* RankingCost
/**
* A cost layer for the learning to rank (LTR) task. This layer contains at least
* three inputs.
* \f[
* C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
* o_{i,j} = o_i - o_j \\
* \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
* \f]
*
* [1] Chris Burges, Tal Shaked, Erin Renshaw, et al. Learning to
* Rank using Gradient Descent.
*/
class RankingCost : public Layer {
public:
......@@ -155,12 +189,25 @@ private:
double negPairCount_;
MatrixPtr margin_;
MatrixPtr marginGrad_;
// if input label is put in ids (not value), copy to this buffer.
/// if input label is put in ids (not value), copy to this buffer.
MatrixPtr labelBuf_;
LayerPtr weightLayer_;
};
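A NumPy sketch of the pairwise cost above for a single pair (illustrative only; o_i and o_j are the two scores, p_ij is the target preference in {0, 0.5, 1}):

import numpy as np

def ranking_cost(o_i, o_j, p_ij):
    # C_ij = -P_ij * o_ij + log(1 + exp(o_ij)), with o_ij = o_i - o_j
    o_ij = o_i - o_j
    return -p_ij * o_ij + np.log1p(np.exp(o_ij))

print(ranking_cost(2.0, 1.0, 1.0))   # small cost: the pair is ordered correctly
print(ranking_cost(1.0, 2.0, 1.0))   # larger cost: the pair is mis-ordered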
/* lambdaRank listwise LTR approach */
/**
* LambdaRank is a method for learning arbitrary information retrieval
* measures. It can be applied to any algorithm that learns through gradient
* descent. LambdaRank is a listwise method, in that the cost depends on the
* sorted order of the documents. LambdaRank gives the gradient of the cost
* function:
*
* \f[
* \lambda_{ij} = \frac{1}{1 + e^{o_i - o_j}} \left| \Delta_{NDCG} \right|
* \f]
*
* [1] Christopher J.C. Burges, Robert Ragno, Quoc Viet Le. Learning to Rank
* with Nonsmooth Cost Functions.
*/
class LambdaCost : public Layer {
public:
explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
......@@ -191,9 +238,11 @@ private:
};
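A NumPy sketch of the pairwise LambdaRank gradient above (illustrative only; |delta_NDCG| is taken as given, computing it is outside this sketch):

import numpy as np

def lambda_ij(o_i, o_j, delta_ndcg_abs):
    # lambda_ij = |delta_NDCG| / (1 + exp(o_i - o_j))
    return delta_ndcg_abs / (1.0 + np.exp(o_i - o_j))

# The gradient is large when a pair is mis-ordered and swapping it would
# change NDCG a lot, and small otherwise.
print(lambda_ij(0.2, 1.5, delta_ndcg_abs=0.3))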
/**
* Cross entropy for multi binary labels
* cost[i] = -sum(label[i][j]*log(output[i][j])
* + (1-label[i][j])*log(1-output[i][j]))
* Cross entropy for multi binary labels.
* \f[
* cost[i] = -sum(label[i][j]*log(output[i][j]) +
* (1-label[i][j])*log(1-output[i][j]))
* \f]
*/
class MultiBinaryLabelCrossEntropy : public CostLayer {
protected:
......@@ -210,13 +259,18 @@ public:
void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
};
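A NumPy sketch of the per-sample cost above (illustrative only; output values are probabilities in (0, 1), labels are 0/1):

import numpy as np

def multi_binary_label_cross_entropy(output, label):
    # cost[i] = -sum_j( label*log(output) + (1-label)*log(1-output) )
    return -np.sum(label * np.log(output) +
                   (1 - label) * np.log(1 - output), axis=1)

output = np.array([[0.9, 0.2], [0.3, 0.8]])
label = np.array([[1, 0], [0, 1]])
print(multi_binary_label_cross_entropy(output, label))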
/*
* Huber loss for robust 2-classes classification
/**
* Huber loss for robust two-class classification.
*
* For label={0, 1}, let y=2*label-1. Given output f, the loss is:
* -4*y*f, if y*f < -1
* (1-y*f)^2, if -1 < y*f < 1,
* 0, otherwise
* \f[
* Loss =
* \left\{\begin{matrix}
* 4 * y * f & \textit{if} \ \ y* f < -1 \\
* (1 - y * f)^2 & \textit{if} \ \ -1 < y * f < 1 \\
* 0 & \textit{otherwise}
* \end{matrix}\right.
* \f]
*/
class HuberTwoClass : public CostLayer {
std::vector<Argument> tmpCpuInput_;
......
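A plain Python sketch of the piecewise loss above (illustrative only; label is 0 or 1 and f is the raw output):

def huber_two_class(f, label):
    # y = 2*label - 1; loss = -4*y*f if y*f < -1,
    # (1 - y*f)^2 if -1 <= y*f < 1, and 0 otherwise.
    y = 2.0 * label - 1.0
    z = y * f
    if z < -1.0:
        return -4.0 * z
    if z < 1.0:
        return (1.0 - z) ** 2
    return 0.0

print(huber_two_class(0.3, 1))    # inside the margin: quadratic branch
print(huber_two_class(-2.0, 1))   # badly wrong: linear branch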
......@@ -3083,7 +3083,7 @@ following are cost Layers.
@wrap_name_default()
def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
"""
A cost Layer for leanrning to rank using gradient descent. Details can refer
A cost Layer for learning to rank using gradient descent. For details, refer
to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_.
This layer contains at least three inputs. The weight is an optional
argument, which affects the cost.
......