diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 41ead3c5ecef248830cfb0f8be360f21dcd58e7b..3d6ced713f00bd72622d8aeed3967642b6774ffe 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -34,6 +34,16 @@ else()
     message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
 endif()
 
+# The MKLPacked* layers are only built together with MKLML.
+if(WITH_MKLML)
+  message(STATUS "Compile with MKLPackedLayers")
+else()
+  # Drop every MKLPacked* header/source from the gserver file lists.
+  file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
+  file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
+  list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
+  list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
+  message(STATUS "Skip compiling with MKLPackedLayers")
+endif()
+
 if(NOT WITH_GPU)
     list(REMOVE_ITEM GSERVER_HEADER
          layers/CudnnConvBaseLayer.h
diff --git a/paddle/gserver/layers/MKLPackedGemm.h b/paddle/gserver/layers/MKLPackedGemm.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c4c62eeb87f6ad3b5517ab5e8f80156210626e2
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedGemm.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include "paddle/math/MathFunctions.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +class MKLPackedGemm { +protected: + real* weightPacked_; + real* weightTPacked_; + size_t weightHeight_; + size_t weightWidth_; + +public: + MKLPackedGemm(MatrixPtr weight) { + weightHeight_ = weight->getHeight(); + weightWidth_ = weight->getWidth(); + weightPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + weightTPacked_ = + cblas_sgemm_alloc(CblasBMatrix, 1, weightWidth_, weightHeight_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasNoTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightPacked_); + cblas_sgemm_pack(CblasRowMajor, + CblasBMatrix, + CblasTrans, + 1, + weightWidth_, + weightHeight_, + 1.0, + weight->getData(), + weightWidth_, + weightTPacked_); + } + void compute(MatrixPtr batch2, MatrixPtr batch1, bool transW = false) { + if (transW) { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightTPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } else { + cblas_sgemm_compute(CblasRowMajor, + CblasNoTrans, + CblasPacked, + batch2->getHeight(), + weightWidth_, + weightHeight_, + batch1->getData(), + weightHeight_, + weightPacked_, + weightWidth_, + 1, + batch2->getData(), + weightWidth_); + } + } + ~MKLPackedGemm() { + cblas_sgemm_free(weightPacked_); + cblas_sgemm_free(weightTPacked_); + } +}; +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f455af91ed800f2e3504a865412779ba601442b --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp @@ -0,0 +1,311 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLPackedRecurrentLayer.h" + +namespace paddle { + +REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer); + +bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!Layer::init(layerMap, parameterMap)) return false; + CHECK_EQ(1U, inputLayers_.size()); + CHECK_EQ(1U, parameters_.size()); + CHECK_EQ(getSize() * getSize(), parameters_[0]->getSize()); + weight_.reset(new Weight(getSize(), getSize(), parameters_[0])); + if (biasParameter_.get() != NULL) { + bias_.reset(new Weight(1, getSize(), biasParameter_)); + } + reversed_ = config_.reversed(); + + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); + + return true; +} + +void MKLPackedRecurrentLayer::resetState() { + CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); + prevOutput_->zeroMem(); +} + +void MKLPackedRecurrentLayer::setState(LayerStatePtr state) { + CHECK(state->value.size() == 1) << "one matrix is expected for RNN state"; + prevOutput_->copyFrom(*(state->value[0])); +} + +LayerStatePtr MKLPackedRecurrentLayer::getState() { + LayerStatePtr res = std::make_shared(); + res->value.push_back(prevOutput_->clone(0, 0, useGpu_)); + res->value[0]->copyFrom(*prevOutput_); + return res; +} + +void MKLPackedRecurrentLayer::forward(PassType passType) { + REGISTER_TIMER_INFO("RecurrentFwTimer", 
getName().c_str()); + Layer::forward(passType); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + size_t numSequences = input.getNumSequences(); + resetOutput(batchSize, getSize()); + CHECK_EQ(getSize(), input.value->getWidth()); + const int* starts = input.sequenceStartPositions->getData(false); + CHECK_EQ(starts[numSequences], batchSize); + + output_.value->assign(*input.value); + if (bias_) { + output_.value->addBias(*bias_->getW(), 1); + } + if (!FLAGS_rnn_use_batch) { + forwardSequence(batchSize, numSequences, starts); + } else { + forwardBatch(batchSize, numSequences, starts); + } +} + +void MKLPackedRecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); + + frameOutput_.reserve(batchSize); + for (int i = frameOutput_.size(); i < batchSize; ++i) { + Argument arg; + arg.value = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + frameOutput_.push_back(arg); + } + + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].value->setData(output_.value->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + forwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::forwardOneSequence(int start, int length) { + if (!reversed_) { + if (prevOutput_) { + frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1); + } + activation_->forward(frameOutput_[start]).check(); + + for (int i = 1; i < length; ++i) { + frameOutput_[start + i].value->mul( + *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + if (prevOutput_) { + prevOutput_->assign(*frameOutput_[start + length - 1].value); + } + } else { + 
activation_->forward(frameOutput_[start + length - 1]).check(); + for (int i = length - 2; i >= 0; --i) { + frameOutput_[start + i].value->mul( + *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1); + activation_->forward(frameOutput_[start + i]).check(); + } + } +} + +void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("RecurrentBwTimer", getName().c_str()); + const Argument& input = getInput(0); + CHECK(input.sequenceStartPositions); + int batchSize = input.getBatchSize(); + const int* starts = input.sequenceStartPositions->getData(false); + size_t numSequences = input.getNumSequences(); + + if (!FLAGS_rnn_use_batch) { + backwardSequence(batchSize, numSequences, starts); + } else { + backwardBatch(batchSize, numSequences, starts); + } + + if (input.grad) { + input.grad->add(*output_.grad); + } + + if (bias_ && bias_->getWGrad()) { + bias_->getWGrad()->collectBias(*output_.grad, 1); + bias_->getParameterPtr()->incUpdate(callback); + } + + weight_->getParameterPtr()->incUpdate(callback); + sgemm_packed_.reset(new MKLPackedGemm(weight_->getW())); +} + +void MKLPackedRecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, + const int* starts) { + REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); + for (int i = 0; i < batchSize; ++i) { + frameOutput_[i].grad->setData(output_.grad->getData() + i * getSize()); + } + + for (size_t i = 0; i < numSequences; ++i) { + backwardOneSequence(starts[i], starts[i + 1] - starts[i]); + } +} + +void MKLPackedRecurrentLayer::backwardOneSequence(int start, int length) { + MatrixPtr weightT = weight_->getW()->getTranspose(); + if (!reversed_) { + for (int i = length - 1; i > 0; --i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i - 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + 
*output_.value->subMatrix(start, length - 1)->getTranspose(), + *output_.grad->subMatrix(start + 1, length - 1), + 1, + 1); + } + } else { + for (int i = 0; i < length - 1; ++i) { + activation_->backward(frameOutput_[start + i]).check(); + frameOutput_[start + i + 1].grad->mul( + *frameOutput_[start + i].grad, *weightT, 1, 1); + } + activation_->backward(frameOutput_[start + length - 1]).check(); + if (weight_->getWGrad()) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(start + 1, length - 1)->getTranspose(), + *output_.grad->subMatrix(start, length - 1), + 1, + 1); + } + } +} + +void MKLPackedRecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchValue_) { + batchValue_.reset(new SequenceToBatch(useGpu_)); + } + + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); + + batchValue_->copyFromSeq(*output_.value); + + { + REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str()); + /* forward one batch */ + for (size_t n = 0; n < batchValue_->getNumBatch(); n++) { + MatrixPtr batch2 = batchValue_->getBatchValue(n); + + if (n != 0) { + MatrixPtr batch1 = + batchValue_->getBatchValue(n - 1, batch2->getHeight()); + + // batch2->mul(*batch1, *weight_->getW(), 1, 1); + sgemm_packed_->compute(batch2, batch1); + } + +#pragma omp parallel for collapse(2) + for (size_t i = 0; i < batch2->getHeight(); i++) { + for (size_t j = 0; j < batch2->getWidth(); j++) { + *(batch2->getData() + i * batch2->getWidth() + j) = + *(batch2->getData() + i * batch2->getWidth() + j) > 0 + ? 
*(batch2->getData() + i * batch2->getWidth() + j) + : 0; + } + } + } + } + + batchValue_->copyBackSeq(*output_.value); +} + +void MKLPackedRecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, + const int* starts) { + if (!batchGrad_) { + batchGrad_.reset(new SequenceToBatch(useGpu_)); + } + batchGrad_->shareIndexWith(*batchValue_); + + size_t numBatch = batchGrad_->getNumBatch(); + bool backwardByBatch = numBatch < numSequences; + + batchGrad_->copyFromSeq(*output_.grad); + { + REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str()); + /* backward one batch */ + for (int n = (int)numBatch - 1; n >= 0; n--) { + MatrixPtr batch2 = batchGrad_->getBatchValue(n); + MatrixPtr batch1 = batchValue_->getBatchValue(n, batch2->getHeight()); + + Argument arg; + arg.value = batch1; + arg.grad = batch2; + activation_->backward(arg).check(); + + if (n != 0) { + batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight()); + // batch1->mul(*batch2, *weightT, 1, 1); + sgemm_packed_->compute(batch1, batch2, true); + } + + if (backwardByBatch && weight_->getWGrad()) { + if (n != 0) { + /* backward weight */ + batch1 = batchValue_->getBatchValue(n - 1, batch2->getHeight()); + weight_->getWGrad()->mul(*batch1->getTranspose(), *batch2, 1, 1); + } + } + } + } + + batchGrad_->copyBackSeq(*output_.grad); + + if (!backwardByBatch && weight_->getWGrad()) { + REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str()); + for (size_t seq = 0; seq < numSequences; ++seq) { + int len = starts[seq + 1] - starts[seq]; + if (!reversed_) { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq] + 1, len - 1), + 1, + 1); + } else { + weight_->getWGrad()->mul( + *output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), + *output_.grad->subMatrix(starts[seq], len - 1), + 1, + 1); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h 
b/paddle/gserver/layers/MKLPackedRecurrentLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..719137f2db74f88ef97a82a430e9f6cc70e2efd1 --- /dev/null +++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h @@ -0,0 +1,131 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "Layer.h" +#include "MKLPackedGemm.h" +#include "SequenceToBatch.h" +#include "paddle/utils/Stat.h" + +DECLARE_bool(rnn_use_batch); + +namespace paddle { + +/** + * @brief MKLPackedRecurrentLayer takes 1 input layer. The output size is the + * same with + * input layer. + * For each sequence [start, end] it performs the following computation: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = start \\ + * out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end + * + * \f] + * If reversed is true, the order is reversed: + * \f[ + * out_{i} = act(in_{i}) \ \ \text{for} \ i = end \\ + * out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end + * \f] + * There are two methods to calculate rnn. One way is to compute rnn one + * sequence by one sequence. The other way is to reorganize the input + * into batches, then compute rnn one batch by one batch. Users can select + * them by rnn_use_batch flag. 
+ */ + +class MKLPackedRecurrentLayer : public Layer { +public: + explicit MKLPackedRecurrentLayer(const LayerConfig& config) : Layer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + + void backward(const UpdateCallback& callback) override; + + void resetState() override; + + void setState(LayerStatePtr state) override; + + LayerStatePtr getState() override; + +protected: + /** + * @brief If user do not set --rnn_use_batch=true, it will + * compute rnn forward one sequence by one sequence in default. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn forward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void forwardOneSequence(int start, int length); + /** + * @brief Compute rnn backward one sequence by onesequence. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardSequence(int batchSize, size_t numSequences, const int* starts); + /** + * @brief Compute rnn backward by one sequence. + * @param start The start position of this sequence (or sample). + * @param length The length of this sequence (or sample), namely the words + * number of this sequence. + */ + void backwardOneSequence(int start, int length); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. It will convert batch shape to sequence after finishing forward. + * The batch info can refer to SequenceToBatch class. 
+ * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void forwardBatch(int batchSize, size_t numSequences, const int* starts); + + /** + * @brief Reorganize input into batches and compute rnn forward batch + * by batch. + * @param batchSize Total words number of all samples in this batch. + * @param numSequences The sample number. + * @param starts Each start position of each samples. + */ + void backwardBatch(int batchSize, size_t numSequences, const int* starts); + +protected: + std::unique_ptr weight_; + std::unique_ptr bias_; + + /// frameOutput_[i] is used to hold the i-th sample of output_ + std::vector frameOutput_; + MatrixPtr prevOutput_; + /// Whether compute rnn by reverse. + bool reversed_; + /// If compute batch by batch, batchValue_ will be used to save the + /// reorganized input value. + std::unique_ptr batchValue_; + /// If compute batch by batch, batchGrad_ will be used to save the + /// gradient with respect to reorganized input value. + std::unique_ptr batchGrad_; + + std::unique_ptr sgemm_packed_; +}; +}