diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
index 3bd8d7cb4c7c63fc4fcdc931cf6935b02dbf0824..f0f1738f305503ec262f0de90dc5c4111fd959d0 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -32,12 +32,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap,
 
   /* initialize the latentVectors_ */
   CHECK_EQ(inputLayers_.size(), 1UL);
-  size_t height = inputLayers_[0]->getSize();
-  CHECK_EQ(parameters_[0]->getSize(), height * factorSize_);
-  latentVectors_ =
-      std::unique_ptr<Weight>(new Weight(height, factorSize_, parameters_[0]));
-
-  v2_ = Matrix::create(height, factorSize_, false, useGpu_);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
 
   return true;
 }
@@ -48,79 +46,85 @@ void FactorizationMachineLayer::forward(PassType passType) {
 
   const MatrixPtr& inputV = getInputValue(0);
 
   size_t batchSize = inputV->getHeight();
-  size_t size = getSize();
-  reserveOutput(batchSize, size);
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
 
   MatrixPtr outV = getOutputValue();
 
-  Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
   Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
 
-  REGISTER_TIMER_INFO("FwMulTimer", getName().c_str());
-  tmpMul_->mul(*inputV, *latentVectors_->getW());
-  tmpMul_->square2(*tmpOut_);
+  REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
   outV->sumRows(*tmpOut_, 0.5, 0);
 
-  x2_ = inputV->clone(0, 0, useGpu_);
-  if (dynamic_cast<CpuSparseMatrix*>(x2_.get())) {
-    x2_->copyFrom(*inputV);
-    (dynamic_cast<CpuSparseMatrix*>(x2_.get()))->square2();
+  inputSquare_ = inputV->clone(0, 0, useGpu_);
+  if (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get())) {
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
   } else {
-    inputV->square2(*x2_);
+    inputV->square2(*inputSquare_);
   }
-  latentVectors_->getW()->square2(*v2_);
-  tmpOut_->mul(*x2_, *v2_);
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
   outV->sumRows(*tmpOut_, -0.5, 1.0);
 
   /* activation */ {
-    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str());
    forwardActivation();
   }
 }
 
 void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
+  /* Do derivation */ { backwardActivation(); }
 
   const MatrixPtr& inputV = getInputValue(0);
   const MatrixPtr& oGrad = getOutputGrad();
 
-  MatrixPtr tmpSum =
-      Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_);
-  MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0),
-                                      latentVectors_->getW()->getHeight(),
-                                      1,
-                                      false,
-                                      useGpu_);
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
 
   /* Calculate the gradients of the latentVectors_ matrix */
   if (latentVectors_->getWGrad()) {
-    MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_);
+    MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_);
     if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
-      CpuSparseMatrix* inputV_s = dynamic_cast<CpuSparseMatrix*>(inputV.get());
-      CpuSparseMatrix* x2_s = dynamic_cast<CpuSparseMatrix*>(x2_.get());
-      CpuSparseMatrix* tmpIn_s = dynamic_cast<CpuSparseMatrix*>(tmpIn.get());
-      tmpIn_s->copyFrom(*inputV_s);
-      tmpIn_s->rowScale(0, *inputV_s, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1);
-      tmpIn_s->rowScale(0, *x2_s, *oGrad);
-
-      MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_);
-      ones->zeroMem();
-      ones->add(-1);
-      tmpSum->mul(*ones, *tmpIn_s, 1, 0);
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
     } else {
-      tmpIn->rowScale(0, *inputV, *oGrad);
-      latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1);
-      tmpIn->rowScale(0, *x2_, *oGrad);
+      tmpInput->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput->rowScale(0, *inputSquare_, *oGrad);
 
-      tmpSum->sumCols(*tmpIn, -1, 0);
+      tmpSum_->sumCols(*tmpInput, -1, 0);
     }
 
     latentVectors_->getWGrad()->addRowScale(
-        0, *latentVectors_->getW(), *tmpSum_T);
+        0, *latentVectors_->getW(), *tmpSumTrans);
 
     /* Increasing the number of gradient */
     latentVectors_->getParameterPtr()->incUpdate(callback);
@@ -129,10 +133,10 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
   /* Calculate the input layers gradient */
   MatrixPtr inGrad = getInputGrad(0);
   if (inGrad != NULL) {
-    MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose();
-    inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1);
-    tmpSum_T->sumRows(*v2_, -1, 0);
-    inGrad->addColScale(0, *inputV, *tmpSum);
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
     inGrad->rowScale(0, *inGrad, *oGrad);
   }
 }
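For reference, the rewritten forward pass computes the pairwise interaction term through the O(kn) reformulation from Rendle's paper (Lemma 3.1), which is what makes a single matrix product plus two sumRows calls sufficient:

\f[
y = \frac{1}{2}\sum_{f=1}^{k}\left[\left(\sum_{i=1}^{n}v_{i,f}x_i\right)^2 - \sum_{i=1}^{n}v_{i,f}^{2}x_{i}^{2}\right]
\f]

Here inputMulFactor_ caches the inner sum X * V, inputSquare_ and latentVectorsSquare_ cache the element-wise squares of X and V, and the two sumRows calls accumulate the bracketed terms with coefficients 0.5 and -0.5 respectively.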
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
index 7cf064690ff8dd85e6c064a98a1f441036e9d38a..85d40fdb1ee39d9e9c644da03a0a584e2342963c 100644
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -34,27 +34,36 @@ namespace paddle {
  * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
  * \f]
  *
+ * The detailed calculation for forward and backward can be found in this paper:
+ *
+ * Rendle, Steffen. Factorization machines. IEEE 10th International
+ * Conference on Data Mining (ICDM). IEEE, 2010.
+ *
  * The config file api is factorization_machine.
  */
 
 class FactorizationMachineLayer : public Layer {
 protected:
-  /// The latent vectors, shape: (size, factorSize_)
-  /// Each row of the latentVectors_ matrix is the latent vector
-  /// corresponding to one input feature dimension
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
   std::unique_ptr<Weight> latentVectors_;
-  /// The hyperparameter that defines the dimensionality of the factorization
+  // The hyperparameter that defines the dimensionality of the factorization
   size_t factorSize_;
 
private:
-  /// The result of input matrix * letent vector matrix that will be used in
-  /// both forward and backward step
-  MatrixPtr tmpMul_;
+  // Store the square values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of the input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Temporary buffer for intermediate calculation results
   MatrixPtr tmpOut_;
-  /// Store the square values of the letent vectors matrix
-  MatrixPtr v2_;
-  /// Store the square values of input matrix
-  MatrixPtr x2_;
+  MatrixPtr tmpSum_;
+  // A row vector filled with -1, used to sum over sparse rows
+  MatrixPtr negOnes_;
 
 public:
   explicit FactorizationMachineLayer(const LayerConfig& config)
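The temporary buffers promoted to members above (tmpSum_, negOnes_) serve the backward pass, which evaluates the standard Factorization Machine derivatives; per sample these are:

\f[
\frac{\partial y}{\partial v_{i,f}} = x_i\sum_{j=1}^{n}v_{j,f}x_j - v_{i,f}x_i^2
\qquad
\frac{\partial y}{\partial x_i} = \sum_{f=1}^{k}v_{i,f}\sum_{j=1}^{n}v_{j,f}x_j - x_i\sum_{f=1}^{k}v_{i,f}^{2}
\f]

In the batched code the first terms reuse inputMulFactor_, tmpSum_ holds the per-feature sums entering the second terms, and negOnes_ is the -1 row vector that sums the gradient-scaled sparse rows (the sparse counterpart of the dense sumCols call).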
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 072d75c23d64d5b5b56befa62164bb501588e8e0..04ff618c214274cffbd554f1cd029bb91e29892f 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2442,6 +2442,7 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) {
 
 TEST(Layer, FactorizationMachineLayer) {
   for (auto useGpu : {false, true}) {
     testFactorizationMachineLayer(INPUT_DATA, useGpu);
+    testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu);
   }
 }
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index e211c23a7e670034af31a38883555bdcabf8b158..6a432cd16b727318a9cd2550632cd1ea2e90c66d 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -262,15 +262,15 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
 
 void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
   CHECK(getFormat() != SPARSE_CSC) << "Not supported";
-  CHECK(height_ == b.getHeight());
-  CHECK(width_ == b.getWidth());
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
   real* A = getValue();
   real* B = b.getValue();
   for (size_t i = 0; i < height_; i++) {
     size_t start = getRowStartIdx(i);
     size_t end = getRowStartIdx(i + 1);
-    CHECK(start == b.getRowStartIdx(i));
-    CHECK(end == b.getRowStartIdx(i + 1));
+    CHECK_EQ(start, b.getRowStartIdx(i));
+    CHECK_EQ(end, b.getRowStartIdx(i + 1));
     for (size_t j = start; j < end; j++) {
       A[j] = B[j] * c.getElement(i, cCol);
     }
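To make the rowScale contract concrete, here is a minimal self-contained sketch of the same CSR row-scaling operation; the CsrMatrix type is an illustrative stand-in, not Paddle's CpuSparseMatrix API. Both operands must share a single sparsity pattern, which is exactly what the strengthened CHECK_EQ guards verify row by row.

#include <cstddef>
#include <vector>

// Illustrative CSR storage (a stand-in for CpuSparseMatrix).
struct CsrMatrix {
  std::vector<std::size_t> rowStart;  // size = height + 1
  std::vector<float> values;          // one entry per nonzero
};

// a[i][j] = b[i][j] * scale[i] over the shared nonzero pattern, the same
// operation as CpuSparseMatrix::rowScale with column cCol of c acting as
// the per-row scale.
void rowScale(CsrMatrix& a, const CsrMatrix& b,
              const std::vector<float>& scale) {
  for (std::size_t i = 0; i + 1 < a.rowStart.size(); ++i) {
    for (std::size_t j = a.rowStart[i]; j < a.rowStart[i + 1]; ++j) {
      a.values[j] = b.values[j] * scale[i];
    }
  }
}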
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 30e334e7c8aa5608ff1bd1ca0345e80f5d8139f1..7e38383bd623c1cbf609f14607a50cdb785c2033 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -7161,16 +7161,26 @@ def factorization_machine(input,
     The Factorization Machine models pairwise feature interactions as inner
     product of the learned latent vectors corresponding to each input feature.
     The Factorization Machine can effectively capture feature interactions
-    especially when the input is sparse. In practice, usually order 2 feature
-    interactions are considered using Factorization Machine with the formula:
+    especially when the input is sparse.
+
+    This implementation only considers the order-2 feature interactions using
+    the Factorization Machine with the formula:
+
     .. math::
        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
     Note:
         X is the input vector with size n. V is the factor matrix. Each row of V
         is the latent vector corresponding to each input dimesion. The size of
         each latent vector is k.
+
+    For details of Factorization Machine, please refer to the paper:
+    Rendle, Steffen. Factorization machines. IEEE 10th International
+    Conference on Data Mining (ICDM). IEEE, 2010.
+
     .. code-block:: python
        factor_machine = factorization_machine(input=input_layer, factor_size=10)
+
     :param input: The input layer.
     :type input: LayerOutput
     :param factor_size: The hyperparameter that defines the dimensionality of
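As a quick dimensional check on the Python API: for an input layer of size n and factor_size k, the layer learns the factor matrix V of shape (n, k), so its single parameter block holds n * k values (the CHECK_EQ in init() above enforces this) and each sample yields one scalar interaction score. For example, an input of size 1024 with factor_size=10 gives 10240 learnable parameters.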