diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index d62a8d846e5b347aa44ce1951c043d5813a5b3ff..6317b66a459db3e4c2165210888e6b884ed2d1bc 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { batchSize, codeLength_, /* trans */ false, - useGpu(deviceId_)); + false); Matrix::resizeOrCreate(preOutput_.grad, batchSize, codeLength_, /* trans */ false, - useGpu(deviceId_)); - + false); IVectorPtr label = getInput(*getLabelLayer()).ids; - preOutput_.value->zeroMem(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuOutput_, + output_.value->getHeight(), + output_.value->getWidth(), + /* trans */ false, + false); + IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + cpuOutput_->copyFrom(*output_.value); + } else { + cpuOutput_ = output_.value; + cpuLabel_ = label; + } /* add the bias-vector */ if (biases_.get() != NULL) { - preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW()); + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_->getW()); + } else { + cpuBias_ = biases_->getW(); + } + preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_); } for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { MatrixPtr input = getInputValue(i); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInput_ = input; + cpuWeight_ = weights_[i]->getW(); + } preOutput_.value->mulByBitCode( - numClasses_, *label, *weights_[i]->getW(), *input); + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_); } // keep consistent with the clipping in the following softrelu preOutput_.value->clip(-40.0, 40.0); preOutput_.value->sumByBitCode(numClasses_, - *label, - *output_.value, + *cpuLabel_, + *cpuOutput_, -1); // scaleSum preOutput_.value->softrelu(*preOutput_.value); - MatrixPtr sum = - Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_)); + MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false); preOutput_.value->rowSum(*sum); - output_.value->add(*sum); + cpuOutput_->add(*sum); + if (useGpu_) { + output_.value->copyFrom(*cpuOutput_); + } else { + output_.value = cpuOutput_; + } } void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { IVectorPtr label = getInput(*getLabelLayer()).ids; + if (useGpu_) { + IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + } else { + cpuLabel_ = label; + } preOutput_.grad->one(); preOutput_.grad->softreluDerivative(*preOutput_.value); - preOutput_.grad->subByBitCode(numClasses_, *label); + preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_); if (biases_ && biases_->getWGrad()) { - preOutput_.grad->addByBitCodeBackward( - numClasses_, *label, *biases_->getWGrad()); - + MatrixPtr biases_grad = biases_->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_grad); + } else { + cpuBias_ = biases_grad; + } + preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_); + if (useGpu) { + biases_grad->copyFrom(*cpuBias_); + } else { + biases_grad = cpuBias_; + } /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); } @@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { /* Calculate the W-gradient for the current layer */ MatrixPtr input = getInputValue(i); if (weights_[i]->getWGrad()) { + MatrixPtr weights_grad = weights_[i]->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeightGrad_, + weights_grad->getHeight(), + weights_grad->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeightGrad_->copyFrom(*weights_grad); + } else { + cpuInput_ = input; + cpuWeightGrad_ = weights_grad; + } preOutput_.grad->mulByBitCodeBackwardWeight( - numClasses_, *label, *weights_[i]->getWGrad(), *input); - + numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_); + if (useGpu_) { + weights_grad->copyFrom(*cpuWeightGrad_); + } else { + weights_grad = cpuWeightGrad_; + } /* Increasing the number of gradient */ weights_[i]->getParameterPtr()->incUpdate(callback); } @@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { /* Calculate the input layers error */ MatrixPtr inputGrad = getInputGrad(i); if (inputGrad) { + if (useGpu_) { + Matrix::resizeOrCreate(cpuInputGrad_, + inputGrad->getHeight(), + inputGrad->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInputGrad_->copyFrom(*inputGrad); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInputGrad_ = inputGrad; + cpuWeight_ = weights_[i]->getW(); + } preOutput_.grad->mulByBitCodeBackwardError( - numClasses_, *label, *weights_[i]->getW(), *inputGrad); + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_); + if (useGpu_) { + inputGrad->copyFrom(*cpuInputGrad_); + } else { + inputGrad = cpuInputGrad_; + } } } } diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 9afd40b1674680da962d6e51caa56b46279b70de..7f896e61ca26e3e22b99b65b1285384a121f7f02 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -80,6 +80,15 @@ protected: int codeLength_; /// temporary result of output_ Argument preOutput_; + + /// The temporary variables in CPU memory. + MatrixPtr cpuWeight_; + MatrixPtr cpuWeightGrad_; + MatrixPtr cpuInput_; + MatrixPtr cpuInputGrad_; + MatrixPtr cpuBias_; + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; }; } // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index a9fc733d1de441dea9f817c18ec65743836c2f23..c5359f272b4bed4d4d2483bf19d7ae482b0d33dd 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) { config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); - // Not support GPU now - testLayerGrad(config, - "hsigmoid", - 100, - /* trans */ false, /* useGpu */ - false); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "hsigmoid", + 100, + /* trans */ false, + /* useGpu */ useGpu); + } } TEST(Layer, multi_cross) {