diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index 6f51d551200696ebafade2a46243b78086975265..b539374cd4aa5a9510cdb728c1b22edf65a9f880 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" #include "PaddleAPIPrivate.h" @@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx, } void Arguments::setSlotSubSequenceStartPositions( - size_t idx, IVector *vec) throw(RangeError) { + size_t idx, IVector* vec) throw(RangeError) { auto& a = m->getArg(idx); auto& v = m->cast(vec->getSharedPtr()); a.subSequenceStartPositions = std::make_shared(v); diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index 25d94f5a6a1255f3e2faff9816cfd003b20c0418..bc40d871d180a6bfe21200c866181dc161f5f078 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" #include "PaddleAPIPrivate.h" #include "paddle/trainer/Trainer.h" @@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile( return retv; } -TrainerConfig* TrainerConfig::createFromProtoString( - const std::string& str) { +TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) { auto retv = new TrainerConfig(); paddle::TrainerConfig trainerConfigProto; auto conf = std::make_shared(trainerConfigProto); diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp index bef499c67858b8e2d5432155a8defca56af6019c..9a4846d80980e23e97f89b6134e15af71207ae6b 100644 --- a/paddle/api/GradientMachine.cpp +++ b/paddle/api/GradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "PaddleAPI.h" #include "PaddleAPIPrivate.h" @@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {} GradientMachine::~GradientMachine() { delete m; } GradientMachine* GradientMachine::createFromPaddleModelPtr( - const void* confPtr, GradientMatchineCreateMode mode, + const void* confPtr, + GradientMatchineCreateMode mode, const std::vector& types) { auto& conf = *(const paddle::ModelConfig*)(confPtr); std::vector realTypes; @@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr( } GradientMachine* GradientMachine::createByConfigProtoStr( - const std::string& protoStr, GradientMatchineCreateMode mode, + const std::string& protoStr, + GradientMatchineCreateMode mode, const std::vector& types) { paddle::ModelConfig conf; conf.ParseFromString(protoStr); @@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr( } GradientMachine* GradientMachine::createByModelConfig( - ModelConfig* conf, GradientMatchineCreateMode mode, + ModelConfig* conf, + GradientMatchineCreateMode mode, const std::vector& types) { auto confPtr = &conf->m->conf->getModelConfig(); return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); } -void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs, +void GradientMachine::forward(const Arguments& inArgs, + Arguments* outArgs, PassType passType) { auto& in = m->cast>(inArgs.getInternalArgumentsPtr()); @@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) { } void GradientMachine::forwardBackward(const Arguments& inArgs, - Arguments* outArgs, PassType passType, + Arguments* outArgs, + PassType passType, const UpdateCallback& callback) { auto& in = m->cast>(inArgs.getInternalArgumentsPtr()); @@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { void GradientMachine::randParameters() { m->machine->randParameters(); } Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const - throw(UnsupportError) { + throw(UnsupportError) { auto nn = std::dynamic_pointer_cast(m->machine); if (nn) { auto mat = nn->getLayerOutput(layerName); @@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const } SequenceGenerator* GradientMachine::asSequenceGenerator( - const std::vector& dict, size_t begin_id, size_t end_id, - size_t max_length, size_t beam_size) { + const std::vector& dict, + size_t begin_id, + size_t end_id, + size_t max_length, + size_t beam_size) { SequenceGenerator* r = SequenceGenerator::createByGradientMachineSharedPtr(&m->machine); r->setDict(dict); diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h index b990f650be9fa401898a8c6d10c21d9c90eb728a..66a13bc603ed5098997f168d3f527160ac3822ef 100644 --- a/paddle/api/Internal.h +++ b/paddle/api/Internal.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "PaddleAPI.h" @@ -23,7 +22,8 @@ limitations under the License. 
*/ template <typename T1, typename T2> void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) { dest->resize(src.size()); - std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){ - return static_cast<T2>(t); - }); + std::transform(src.begin(), + src.end(), + dest->begin(), + [](T1 t) { return static_cast<T2>(t); }); } diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp index e5493a381a6f9e3d135c14649a8e1e438494d363..f257ee65aa4a12dfcd1914ddbf0e16461a9b128c 100644 --- a/paddle/api/Matrix.cpp +++ b/paddle/api/Matrix.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" @@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) { return m; } -Matrix* Matrix::createDense(const std::vector& data, size_t height, - size_t width, bool useGpu) { +Matrix* Matrix::createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu) { auto m = new Matrix(); m->m->mat = paddle::Matrix::create(height, width, useGpu); m->m->mat->copyFrom(data.data(), data.size()); return m; } -Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2, - bool copy, bool useGpu) - throw (UnsupportError) { +Matrix* Matrix::createDenseFromNumpy(float* data, + int dim1, + int dim2, + bool copy, + bool useGpu) throw(UnsupportError) { if (useGpu) { /// Gpu mode only supports copy=True if (!copy) { @@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2, } } -Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2, +Matrix* Matrix::createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, bool copy) { auto m = new Matrix(); if (copy) { @@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) { return m; } -Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz, - bool isNonVal, bool isTrans, bool useGpu) { +Matrix* Matrix::createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal, + bool isTrans, + bool useGpu) { auto m = new Matrix(); m->m->mat = paddle::Matrix::createSparseMatrix( - height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE, - isTrans, useGpu); + height, + width, + nnz, + isNonVal ?
paddle::NO_VALUE : paddle::FLOAT_VALUE, + isTrans, + useGpu); return m; } @@ -221,7 +234,8 @@ FloatArray Matrix::getData() const { } void Matrix::sparseCopyFrom( - const std::vector& rows, const std::vector& cols, + const std::vector& rows, + const std::vector& cols, const std::vector& vals) throw(UnsupportError) { auto cpuSparseMat = std::dynamic_pointer_cast(m->mat); @@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom( void* Matrix::getSharedPtr() const { return &m->mat; } -void Matrix::toNumpyMatInplace(float** view_data, int* dim1, +void Matrix::toNumpyMatInplace(float** view_data, + int* dim1, int* dim2) throw(UnsupportError) { auto cpuMat = std::dynamic_pointer_cast(m->mat); if (cpuMat) { @@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1, throw UnsupportError(); } } -void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, +void Matrix::copyToNumpyMat(float** view_m_data, + int* dim1, int* dim2) throw(UnsupportError) { static_assert(sizeof(paddle::real) == sizeof(float), "Currently PaddleAPI only support for single " @@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, } else if (auto gpuMat = dynamic_cast(m->mat.get())) { auto src = gpuMat->getData(); auto dest = *view_m_data; - hl_memcpy_device2host(dest, src, - sizeof(paddle::real) * (*dim1) * (*dim2)); + hl_memcpy_device2host( + dest, src, sizeof(paddle::real) * (*dim1) * (*dim2)); } else { LOG(WARNING) << "Unexpected Situation"; throw UnsupportError(); @@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1, } } -void Matrix::copyFromNumpyMat(float* data, int dim1, +void Matrix::copyFromNumpyMat(float* data, + int dim1, int dim2) throw(UnsupportError, RangeError) { if (isSparse()) { throw UnsupportError(); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 5688ece44d2d58a2184a9f23d4af26c51c319579..c07facdb1292b34ac31247160a4347ea359e718b 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -61,8 +60,8 @@ class RangeError {}; /// Not support Error, such as access GPU memory directly, etc. class UnsupportError : public std::runtime_error { public: - UnsupportError() : std::runtime_error(" ") {}; - UnsupportError(const std::string& message) : std::runtime_error(message) {}; + UnsupportError() : std::runtime_error(" "){}; + UnsupportError(const std::string& message) : std::runtime_error(message){}; }; /// This type will map to python's list of float. @@ -112,7 +111,8 @@ public: /** * Create A Matrix with height,width, which is filled by zero. */ - static Matrix* createZero(size_t height, size_t width, + static Matrix* createZero(size_t height, + size_t width, bool useGpu = isUsingGpu()); /** @@ -124,8 +124,11 @@ public: * * @note the default sparse type is SPARSE_CSR. */ - static Matrix* createSparse(size_t height, size_t width, size_t nnz, - bool isNonVal = true, bool trans = false, + static Matrix* createSparse(size_t height, + size_t width, + size_t nnz, + bool isNonVal = true, + bool trans = false, bool useGpu = isUsingGpu()); /** @@ -134,13 +137,17 @@ public: * @param data list of float should be passed in python. * @note the value will be copy into a new matrix. 
*/ - static Matrix* createDense(const std::vector& data, size_t height, - size_t width, bool useGpu = isUsingGpu()); - - static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2, - bool copy = true, - bool useGpu = isUsingGpu()) - throw (UnsupportError); + static Matrix* createDense(const std::vector& data, + size_t height, + size_t width, + bool useGpu = isUsingGpu()); + + static Matrix* createDenseFromNumpy( + float* data, + int dim1, + int dim2, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); /** * Create Cpu Dense Matrix from numpy matrix, dtype=float32 @@ -151,7 +158,9 @@ public: * @param copy true if copy into a new matrix, false will create * matrix inplace. */ - static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2, + static Matrix* createCpuDenseFromNumpy(float* data, + int dim1, + int dim2, bool copy = false); /// Create Gpu Dense Matrix from numpy matrix, dtype=float32 @@ -171,11 +180,13 @@ public: * numpy_mat = m.toNumpyMat() * @endcode */ - void toNumpyMatInplace(float** view_data, int* dim1, + void toNumpyMatInplace(float** view_data, + int* dim1, int* dim2) throw(UnsupportError); /// Copy To numpy mat. - void copyToNumpyMat(float** view_m_data, int* dim1, + void copyToNumpyMat(float** view_m_data, + int* dim1, int* dim2) throw(UnsupportError); /// Copy From Numpy Mat @@ -248,15 +259,18 @@ public: static Vector* create(const std::vector& data, bool useGpu = isUsingGpu()); - static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true, - bool useGpu = isUsingGpu()) - throw (UnsupportError); + static Vector* createVectorFromNumpy( + float* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); /** * Create Cpu Vector from numpy array, which dtype=float32 * * If copy is false, it will create vector inplace. 
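The two numpy factories above differ only in what the copy flag means for ownership, and Matrix.cpp earlier in this patch shows the GPU path rejecting copy=false outright. A minimal sketch of that contract, assuming the C++ API from PaddleAPI.h is linked and initialized; the 2x3 buffer is a stand-in for a numpy float32 array:

#include "PaddleAPI.h"

int main() {
  float buf[6] = {0, 1, 2, 3, 4, 5};  // stand-in for a numpy float32 array

  // copy=false wraps buf in place, so buf must outlive the matrix;
  // copy=true snapshots it into freshly allocated CPU memory.
  Matrix* view = Matrix::createCpuDenseFromNumpy(buf, 2, 3, false);
  Matrix* snap = Matrix::createCpuDenseFromNumpy(buf, 2, 3, true);

  // The generic factory routes on useGpu; per Matrix.cpp above, the GPU
  // branch supports only copy=true and throws UnsupportError otherwise.
  Matrix* any = Matrix::createDenseFromNumpy(buf, 2, 3, true, isUsingGpu());
  (void)view; (void)snap; (void)any;
  return 0;
}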
*/ - static Vector* createCpuVectorFromNumpy(float* data, int dim, + static Vector* createCpuVectorFromNumpy(float* data, + int dim, bool copy = false); /// Create Gpu Vector from numpy array, which dtype=float32 @@ -312,16 +326,19 @@ public: static IVector* create(const std::vector& data, bool useGpu = isUsingGpu()); - static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true, - bool useGpu = isUsingGpu()) - throw (UnsupportError); + static IVector* createVectorFromNumpy( + int* data, + int dim, + bool copy = true, + bool useGpu = isUsingGpu()) throw(UnsupportError); /** * Create Cpu IVector from numpy array, which dtype=int32 * * If copy is false, it will create vector inplace */ - static IVector* createCpuVectorFromNumpy(int* data, int dim, + static IVector* createCpuVectorFromNumpy(int* data, + int dim, bool copy = false); /** * Create Gpu IVector from numpy array, which dtype=int32 @@ -605,7 +622,8 @@ class ParameterTraverseCallback { public: ~ParameterTraverseCallback(); - void apply(const std::vector& vecs, const ParameterConfig& config, + void apply(const std::vector& vecs, + const ParameterConfig& config, size_t sparseId); private: @@ -638,7 +656,8 @@ public: void finishBatch(); - void update(const std::vector& vecs, const ParameterConfig& conf, + void update(const std::vector& vecs, + const ParameterConfig& conf, size_t sparseId = NO_SPARSE_ID); std::vector getParameterTypes() const; @@ -678,7 +697,8 @@ public: * model config by TrainerConfig */ static GradientMachine* createByModelConfig( - ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, + ModelConfig* conf, + GradientMatchineCreateMode mode = CREATE_MODE_NORMAL, const std::vector& parameterTypes = defaultParamTypes); /** @@ -701,7 +721,8 @@ public: /** * Combine forward/backward */ - void forwardBackward(const Arguments& inArgs, Arguments* outArgs, + void forwardBackward(const Arguments& inArgs, + Arguments* outArgs, PassType passType, const UpdateCallback& callback = UpdateCallback()); @@ -722,14 +743,17 @@ public: */ SequenceGenerator* asSequenceGenerator( const std::vector& dict = std::vector(), - size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL, + size_t begin_id = 0UL, + size_t end_id = 0UL, + size_t max_length = 100UL, size_t beam_size = -1UL); private: GradientMachinePrivate* m; static GradientMachine* createFromPaddleModelPtr( - const void* confPtr, GradientMatchineCreateMode mode, + const void* confPtr, + GradientMatchineCreateMode mode, const std::vector& types); // Not to use c++ 11 init-list, so we use static var as function default arg. @@ -751,8 +775,8 @@ public: /// Create A Trainer By TrainerConfig. using paddle command line. static Trainer* createByCommandLine() throw(IOError); - static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm) - throw(IOError); + static Trainer* create(TrainerConfig* optConfig, + GradientMachine* gm) throw(IOError); /// Start training void startTrain(); diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp index 8b56adc97c2d6178a9e0b272a9af89732a3573f6..c5876bb1c71438578831ffffd85840c706b6224c 100644 --- a/paddle/api/Parameter.cpp +++ b/paddle/api/Parameter.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "PaddleAPI.h" #include "paddle/parameter/Parameter.h" diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index b13761ab0900d57008c17094c5199ef31a040f54..21d031e4bcb897eb693e5cff56bc77a637dc6bd2 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" #include "PaddleAPIPrivate.h" #include "paddle/parameter/ParameterOptimizer.h" @@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate { const paddle::ParameterOptimizer::TraverseCallback& callback) : callback(callback) {} - void apply(const std::vector& vecs, const ParameterConfig& conf, + void apply(const std::vector& vecs, + const ParameterConfig& conf, size_t sparseId) { std::vector real_vecs; real_vecs.resize(vecs.size()); - std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) { - if (v) { - return *(paddle::VectorPtr*)(v->getSharedPtr()); - } else { - return paddle::VectorPtr(); - } - }); + std::transform(vecs.begin(), + vecs.end(), + real_vecs.begin(), + [](Vector* v) { + if (v) { + return *(paddle::VectorPtr*)(v->getSharedPtr()); + } else { + return paddle::VectorPtr(); + } + }); paddle::ParameterConfig& real_conf = *(paddle::ParameterConfig*)(const_cast(conf) @@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) { void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); } void ParameterOptimizer::update(const std::vector& vecs, - const ParameterConfig& conf, size_t sparseId) { - ParameterTraverseCallbackPrivate invoker([&]( - const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config, - size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); + const ParameterConfig& conf, + size_t sparseId) { + ParameterTraverseCallbackPrivate invoker( + [&](const paddle::VectorPtr _vecs[], + const paddle::ParameterConfig& config, + size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); }); invoker.apply(vecs, conf, sparseId); } @@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector& vecs, ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal( const ParameterConfig& config) const { - auto& param_config = *(paddle::ParameterConfig*)const_cast( - config).getRawPtr(); + auto& param_config = + *(paddle::ParameterConfig*)const_cast(config) + .getRawPtr(); auto callback = m->optimizer->needSpecialTraversal(param_config); if (callback) { auto retCallback = new ParameterTraverseCallback(); diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp index 9d353ccc8e281e72a207ba19f45517fd256d6df2..d51be78d45902967107f4bf0af995958faed931a 100644 --- a/paddle/api/SequenceGenerator.cpp +++ b/paddle/api/SequenceGenerator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "PaddleAPI.h" #include "paddle/gserver/gradientmachines/GradientMachine.h" #include "paddle/parameter/Argument.h" @@ -42,8 +41,10 @@ struct Path { // position static void findNBest(paddle::GradientMachine* gradMachine, std::vector& inArgs, - std::vector& finalPaths, size_t bos_id, - size_t eos_id, size_t max_length) { + std::vector& finalPaths, + size_t bos_id, + size_t eos_id, + size_t max_length) { std::vector paths; Path emptyPath; paths.push_back(emptyPath); @@ -166,7 +167,8 @@ public: if (id < getSize()) { Path& p = (*path_)[id]; std::ostringstream sout; - std::transform(p.ids.begin(), p.ids.end(), + std::transform(p.ids.begin(), + p.ids.end(), std::ostream_iterator(sout, split ? " " : ""), [&](int id) { return (*dict_)[id]; }); return sout.str(); diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp index b61f36f740d47fe785b30361f26059bf0b64829d..7a6aa69fb652313748b1fa787847ffd74fda7a22 100644 --- a/paddle/api/Trainer.cpp +++ b/paddle/api/Trainer.cpp @@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) { Trainer::Trainer(TrainerConfig* config, GradientMachine* gm) : m(new TrainerPrivate()) { - m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr); + m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr); } -Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm) - throw(IOError) -{ +Trainer* Trainer::create(TrainerConfig* config, + GradientMachine* gm) throw(IOError) { auto retv = new Trainer(config, gm); if (retv->m->getConfig().IsInitialized()) { return retv; @@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); } Matrix* Trainer::getLayerOutput(const std::string& layerName) { auto nn = std::dynamic_pointer_cast( - this->m->getGradientMachine()); + this->m->getGradientMachine()); CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork"; auto m = nn->getLayerOutput(layerName); return Matrix::createByPaddleMatrixPtr(&m); } -void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); } +void Trainer::forwardOneBatch(size_t batchSize) { + m->forwardOneBatch(batchSize); +} -bool TrainerPrivate::forwardOneBatch(size_t batchSize) { +bool TrainerPrivate::forwardOneBatch(size_t batchSize) { CHECK(dataProvider_) << "data_provider is not specified"; paddle::DataBatch dataBatch; int num = dataProvider_->getNextBatch(batchSize, &dataBatch); @@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize) { void TrainerPrivate::forwardOneDataBatch( const std::vector& inArgs) { - std::vector& outArgs = forwardOutput_; if (config_->getOptConfig().use_sparse_remote_updater()) { diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp index a8932351a685474a756c3f5b0e5e8c42bbf58237..1bba1df2e1c0a2d3cd2d8307ed3a0d784bb949b4 100644 --- a/paddle/api/Util.cpp +++ b/paddle/api/Util.cpp @@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l) IntArray::IntArray(const int* b, const size_t l, bool f) : buf(b), length(l), needFree(f) {} -IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l, +IntWithFloatArray::IntWithFloatArray(const float* v, + const int* i, + size_t l, bool f) : valBuf(v), idxBuf(i), length(l), needFree(f) {} -bool isUsingGpu() {return FLAGS_use_gpu;} +bool isUsingGpu() { return FLAGS_use_gpu; } -void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;} +void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; } bool isGpuVersion() { #ifdef PADDLE_ONLY_CPU diff --git 
a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index d44cdefc35bd09e04412b52fb9981947caf89588..cc1c098223826a06fea291a95730d7fc1fd1beb3 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PaddleAPI.h" #include "paddle/math/Vector.h" @@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector& data, bool useGpu) { return v; } -IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy, - bool useGpu) throw (UnsupportError){ +IVector* IVector::createVectorFromNumpy(int* data, + int dim, + bool copy, + bool useGpu) throw(UnsupportError) { if (useGpu) { /// if use gpu only copy=true is supported if (!copy) { @@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) { if (auto cpuVec = dynamic_cast(m->vec.get())) { std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1)); } else if (auto gpuVec = dynamic_cast(m->vec.get())) { - hl_memcpy_device2host(*view_m_data, gpuVec->getData(), - sizeof(int) * (*dim1)); + hl_memcpy_device2host( + *view_m_data, gpuVec->getData(), sizeof(int) * (*dim1)); } else { LOG(INFO) << "Unexpected situation"; } @@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) { } } -Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy, - bool useGpu) throw (UnsupportError){ +Vector* Vector::createVectorFromNumpy(float* data, + int dim, + bool copy, + bool useGpu) throw(UnsupportError) { if (useGpu) { /// if use gpu only copy=True is supported if (!copy) { @@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) { if (auto cpuVec = dynamic_cast(m->vec.get())) { std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1)); } else if (auto gpuVec = dynamic_cast(m->vec.get())) { - hl_memcpy_device2host(*view_m_data, gpuVec->getData(), - sizeof(float) * (*dim1)); + hl_memcpy_device2host( + *view_m_data, gpuVec->getData(), sizeof(float) * (*dim1)); } else { LOG(INFO) << "Unexpected situation"; } diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h index c8aabc7844cd48d7ebdd0077684f9efa50f941a2..03e15b2223a50625c6999f6b081ae984e76b182b 100644 --- a/paddle/cuda/include/hl_activation_functions.h +++ b/paddle/cuda/include/hl_activation_functions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_ACTIVATION_FUNCTIONS_H_ #define HL_ACTIVATION_FUNCTIONS_H_ @@ -21,11 +20,8 @@ limitations under the License. */ /** * Active functions: sigmoid, relu, tanh and linear. 
*/ -#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \ - hppl::relu, \ - hppl::tanh, \ - hppl::linear \ - } +#define HPPL_ACTIVE_FUNCTION \ + { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear } namespace hppl { @@ -42,18 +38,18 @@ public: #ifdef __NVCC__ namespace gpu { -static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION; +static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION; static __device__ Active::backward backward[] = HPPL_ACTIVE_FUNCTION; } #else namespace cpu { -static Active::forward forward[] = HPPL_ACTIVE_FUNCTION; +static Active::forward forward[] = HPPL_ACTIVE_FUNCTION; static Active::backward backward[] = HPPL_ACTIVE_FUNCTION; } #ifdef __AVX__ namespace avx { -static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION; +static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION; static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION; } #endif diff --git a/paddle/cuda/include/hl_aggregate.h b/paddle/cuda/include/hl_aggregate.h index db75809f5de195d41577ed6569e8508f48241b69..a6d9ff8483eee28b2c8a380f0aca097c7662a02e 100644 --- a/paddle/cuda/include/hl_aggregate.h +++ b/paddle/cuda/include/hl_aggregate.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_AGGREGATE_H_ #define HL_AGGREGATE_H_ diff --git a/paddle/cuda/include/hl_avx_functions.h b/paddle/cuda/include/hl_avx_functions.h index cf062dd969bf79554e00369367e3b85c2ae7fc0d..ed339e312a7639cf9b78f130a43d67a7446576bb 100644 --- a/paddle/cuda/include/hl_avx_functions.h +++ b/paddle/cuda/include/hl_avx_functions.h @@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_AVX_FUNCTIONS_H_ #define HL_AVX_FUNCTIONS_H_ #include namespace hppl { - __m256 relu(const __m256 a); - __m256 sigmoid(const __m256 a); - __m256 tanh(const __m256 a); - __m256 linear(const __m256 a); - - __m256 relu(const __m256 a, const __m256 b); - __m256 sigmoid(const __m256 a, const __m256 b); - __m256 tanh(const __m256 a, const __m256 b); - __m256 linear(const __m256 a, const __m256 b); +__m256 relu(const __m256 a); +__m256 sigmoid(const __m256 a); +__m256 tanh(const __m256 a); +__m256 linear(const __m256 a); + +__m256 relu(const __m256 a, const __m256 b); +__m256 sigmoid(const __m256 a, const __m256 b); +__m256 tanh(const __m256 a, const __m256 b); +__m256 linear(const __m256 a, const __m256 b); } // namespace hppl #endif // HL_AVX_FUNCTIONS_H_ diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h index 9f80898a1f927a0e8bbf86108567a04ccecc38f5..a076952467a5ce10dc1f58007dda2170aa694fbb 100644 --- a/paddle/cuda/include/hl_base.h +++ b/paddle/cuda/include/hl_base.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - - #ifndef HL_BASE_H_ #define HL_BASE_H_ @@ -33,36 +31,36 @@ limitations under the License. */ * HPPL_STREAM_DEFAULT is HPPL default stream. 
*/ typedef enum { - HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/ - HPPL_STREAM_1 = 1, - HPPL_STREAM_2 = 2, - HPPL_STREAM_3 = 3, - HPPL_STREAM_4 = 4, - HPPL_THREAD_STREAM_1 = 5, - HPPL_THREAD_STREAM_2 = 6, - HPPL_THREAD_STREAM_3 = 7, - HPPL_THREAD_STREAM_4 = 8, - HPPL_STREAM_END + HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/ + HPPL_STREAM_1 = 1, + HPPL_STREAM_2 = 2, + HPPL_STREAM_3 = 3, + HPPL_STREAM_4 = 4, + HPPL_THREAD_STREAM_1 = 5, + HPPL_THREAD_STREAM_2 = 6, + HPPL_THREAD_STREAM_3 = 7, + HPPL_THREAD_STREAM_4 = 8, + HPPL_STREAM_END } hl_stream_t; /** * @brief HPPL activation mode. */ typedef enum { - HL_ACTIVATION_SIGMOID = 0, - HL_ACTIVATION_RELU = 1, - HL_ACTIVATION_TANH = 2, - HL_ACTIVATION_LINEAR = 3, - HL_ACTIVATION_END + HL_ACTIVATION_SIGMOID = 0, + HL_ACTIVATION_RELU = 1, + HL_ACTIVATION_TANH = 2, + HL_ACTIVATION_LINEAR = 3, + HL_ACTIVATION_END } hl_activation_mode_t; /** * @brief Transpose type. */ typedef enum { - HPPL_OP_N = 0, /* transpose */ - HPPL_OP_T = 1, /* non transpose */ - HPPL_OP_END + HPPL_OP_N = 0, /* non transpose */ + HPPL_OP_T = 1, /* transpose */ + HPPL_OP_END } hl_trans_op_t; /** @@ -148,23 +146,21 @@ typedef struct { * @brief Sparse matrix value type. */ typedef enum { - HL_NO_VALUE = 0, /* matrix values only 0 or 1 */ - HL_FLOAT_VALUE = 1, - HL_VALUE_END + HL_NO_VALUE = 0, /* matrix values only 0 or 1 */ + HL_FLOAT_VALUE = 1, + HL_VALUE_END } hl_matrix_value_t; - /** * @brief HPPL matrix format. */ typedef enum { - HL_SPARSE_CSR = 0, - HL_SPARSE_CSC = 1, - HL_SPARSE_END + HL_SPARSE_CSR = 0, + HL_SPARSE_CSC = 1, + HL_SPARSE_END } hl_matrix_format_t; - -typedef struct _hl_matrix_s * hl_matrix_s; +typedef struct _hl_matrix_s *hl_matrix_s; /** * @brief HPPL sparse matrix. @@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s; * @param nnz nonzero values of sparse matrix. */ typedef struct { - hl_matrix_s matrix; - hl_matrix_format_t format; - hl_matrix_value_t type; - int rows; - int cols; - size_t nnz; + hl_matrix_s matrix; + hl_matrix_format_t format; + hl_matrix_value_t type; + int rows; + int cols; + size_t nnz; } _hl_sparse_matrix_s, *hl_sparse_matrix_s; #ifndef PADDLE_TYPE_DOUBLE @@ -195,7 +191,7 @@ typedef struct { * * HL_FLOAT_MIN: 1.17549435e-38F */ -#define HL_FLOAT_MAX 3.40282347e+38F +#define HL_FLOAT_MAX 3.40282347e+38F /** * if real == double * * HL_FLOAT_MAX: 1.7976931348623157e+308 * * HL_FLOAT_MIN: 2.2250738585072014e-308 */ -#define HL_FLOAT_MIN 1.17549435e-38F +#define HL_FLOAT_MIN 1.17549435e-38F #else -#define HL_FLOAT_MAX 1.7976931348623157e+308 -#define HL_FLOAT_MIN 2.2250738585072014e-308 +#define HL_FLOAT_MAX 1.7976931348623157e+308 +#define HL_FLOAT_MIN 2.2250738585072014e-308 #endif - /** * The maximum input value for exp, used to avoid overflow problem. * * Currently only used for tanh function. */ -#define EXP_MAX_INPUT 40.0 - +#define EXP_MAX_INPUT 40.0 /** * @brief DIVUP(x, y) is similar to ceil(x / y). * the size of blockDim. */ #ifndef DIVUP -#define DIVUP(x, y) (((x) + (y) - 1) / (y)) +#define DIVUP(x, y) (((x) + (y)-1) / (y)) #endif #ifdef __NVCC__ @@ -233,7 +227,7 @@ typedef struct { #include "hl_cuda.h" #include "cuda_runtime.h" -extern __thread bool g_sync_flag; +extern __thread bool g_sync_flag; extern __thread cudaStream_t default_stream; #define STREAM_DEFAULT default_stream @@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream; * @brief Check cuda kernel execution.
* @param msg error string */ -#define CHECK_SYNC(msg) \ - if (true == g_sync_flag) { \ - hl_stream_synchronize(HPPL_STREAM_DEFAULT); \ - cudaError_t err \ - = (cudaError_t)hl_get_device_last_error(); \ - CHECK_EQ(cudaSuccess, err) << "[" << msg << "] " \ - << "CUDA error: " \ - << hl_get_device_error_string((size_t)err); \ +#define CHECK_SYNC(msg) \ + if (true == g_sync_flag) { \ + hl_stream_synchronize(HPPL_STREAM_DEFAULT); \ + cudaError_t err = (cudaError_t)hl_get_device_last_error(); \ + CHECK_EQ(cudaSuccess, err) \ + << "[" << msg << "] " \ + << "CUDA error: " << hl_get_device_error_string((size_t)err); \ } -#endif /* __NVCC__ */ +#endif /* __NVCC__ */ -#endif /* HL_BASE_H_ */ +#endif /* HL_BASE_H_ */ diff --git a/paddle/cuda/include/hl_batch_transpose.h b/paddle/cuda/include/hl_batch_transpose.h index 414c7996acee4ccbe2d7dbd093e25a23119fea3c..f3630e9762508fd39935e62e0007de04f9140fff 100644 --- a/paddle/cuda/include/hl_batch_transpose.h +++ b/paddle/cuda/include/hl_batch_transpose.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_BATCH_TRANSPOSE_H_ #define HL_BATCH_TRANSPOSE_H_ @@ -31,10 +30,7 @@ limitations under the License. */ * order. Each batch has height * width data, which are * arranged in height-first (or row-first) manner. */ -extern void batchTranspose(const real* input, - real* output, - int width, - int height, - int batchSize); +extern void batchTranspose( + const real* input, real* output, int width, int height, int batchSize); #endif // HL_BATCH_TRANSPOSE_H_ diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 70b5be6fda2509853029a68d31129df28d580942..cffaac634f0f64be5ddab961d549ae43775bb7b0 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CNN_H_ #define HL_CNN_H_ @@ -37,15 +36,21 @@ limitations under the License. */ * @param[in] alpha * @param[in] beta */ -extern void hl_shrink_col2feature( - const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, - real alpha = 1.0f, real beta = 0.0f); +extern void hl_shrink_col2feature(const real* dataCol, + size_t channels, + size_t height, + size_t width, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t outputH, + size_t outputW, + real* dataIm, + real alpha = 1.0f, + real beta = 0.0f); /** * @brief Expand feature to column. @@ -65,14 +70,19 @@ extern void hl_shrink_col2feature( * @param[out] dataCol expand data. * */ -extern void hl_expand_feature2col( - const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol); +extern void hl_expand_feature2col(const real* dataIm, + size_t channels, + size_t height, + size_t width, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t outputH, + size_t outputW, + real* dataCol); /** * @brief Maximum pool forward. 
@@ -94,15 +104,21 @@ extern void hl_expand_feature2col( * @param[in] tgtStride stride between output data samples. * */ -extern void hl_maxpool_forward( - const int frameCnt, const real* inputData, - const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride); +extern void hl_maxpool_forward(const int frameCnt, + const real* inputData, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride); /** * @brief Maximum pool backward. @@ -125,20 +141,28 @@ extern void hl_maxpool_forward( * @param[in] paddingH padding height. * @param[in] paddingW padding width. * @param[out] targetGrad output grad. - * @param[in] outStride stride between output data samples. + * @param[in] outStride stride between output data samples. * */ -extern void hl_maxpool_backward( - const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad, const int outStride); +extern void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride); /** * @brief Average pool forward. @@ -160,15 +184,21 @@ extern void hl_maxpool_backward( * @param[in] tgtStride stride between output data samples. * */ -extern void hl_avgpool_forward( - const int frameCnt, const real* inputData, - const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride); +extern void hl_avgpool_forward(const int frameCnt, + const real* inputData, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride); /** * @brief Average pool backward. @@ -189,19 +219,26 @@ extern void hl_avgpool_forward( * @param[in] scaleA scale. * @param[in] scaleB scale. * @param[out] backGrad output grad. - * @param[in] outStride stride between output data samples. + * @param[in] outStride stride between output data samples.
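The pooling entry points above take pooledH/pooledW precomputed, so the caller owns the output-size arithmetic. A sketch of a single forward call, assuming device buffers already hold the input, hl_start() has run, and the usual (size + 2*padding - window) / stride + 1 convention for the output extent (that convention is an assumption here, not something this header states):

#include "hl_base.h"
#include "hl_cnn.h"

void maxpoolOnce(const real* inGpu, real* outGpu) {
  const int frameCnt = 1, channels = 8, height = 32, width = 32;
  const int sizeX = 2, sizeY = 2, strideH = 2, strideW = 2;
  const int paddingH = 0, paddingW = 0;
  const int pooledH = (height + 2 * paddingH - sizeY) / strideH + 1;  // 16
  const int pooledW = (width + 2 * paddingW - sizeX) / strideW + 1;   // 16
  // tgtStride separates consecutive output samples; one packed sample
  // per frame (channels * pooledH * pooledW) is assumed here.
  hl_maxpool_forward(frameCnt, inGpu, channels, height, width,
                     pooledH, pooledW, sizeX, sizeY, strideH, strideW,
                     paddingH, paddingW, outGpu,
                     channels * pooledH * pooledW);
}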
* */ -extern void hl_avgpool_backward( - const int frameCnt, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - int paddingH, int paddingW, - real scaleA, real scaleB, - real* backGrad, const int outStride); +extern void hl_avgpool_backward(const int frameCnt, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + int paddingH, + int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride); /** * @brief Cross-map-response normalize forward. @@ -218,10 +255,16 @@ extern void hl_avgpool_backward( * @param[in] beta scale. * */ -extern void hl_CMRNorm_forward( - size_t frameCnt, const real* in, real* scale, real* out, - size_t channels, size_t height, size_t width, size_t sizeX, - real alpha, real beta); +extern void hl_CMRNorm_forward(size_t frameCnt, + const real* in, + real* scale, + real* out, + size_t channels, + size_t height, + size_t width, + size_t sizeX, + real alpha, + real beta); /** * @brief Cross-map-response normalize backward. @@ -240,11 +283,18 @@ extern void hl_CMRNorm_forward( * @param[in] beta scale. * */ -extern void hl_CMRNorm_backward( - size_t frameCnt, const real* inV, const real* scale, - const real* outV, const real* outDiff, real *inDiff, - size_t channels, size_t height, size_t width, size_t sizeX, - real alpha, real beta); +extern void hl_CMRNorm_backward(size_t frameCnt, + const real* inV, + const real* scale, + const real* outV, + const real* outDiff, + real* inDiff, + size_t channels, + size_t height, + size_t width, + size_t sizeX, + real alpha, + real beta); /** * @brief Bilinear interpolation forward. @@ -278,24 +328,24 @@ extern void hl_bilinear_forward(const real* inData, const real ratioH, const real ratioW); - /** - * @brief Bilinear interpolation backward. - * - * @param[out] inGrad input gradient. - * @param[in] inImgH input image height. - * @param[in] inImgW input image width. - * @param[in] inputH input batchSize. - * @param[in] inputW input image data dim. - * @param[in] outGrad output gradient. - * @param[in] outImgH output image height. - * @param[in] outImgW output image width. - * @param[in] outputH output batchSize. - * @param[in] outputW output image data dim. - * @param[in] numChannels number of channels. - * @param[in] ratioH inImgH / outImgH. - * @param[in] ratioW inImgW / outImgW. - * - */ +/** +* @brief Bilinear interpolation backward. +* +* @param[out] inGrad input gradient. +* @param[in] inImgH input image height. +* @param[in] inImgW input image width. +* @param[in] inputH input batchSize. +* @param[in] inputW input image data dim. +* @param[in] outGrad output gradient. +* @param[in] outImgH output image height. +* @param[in] outImgW output image width. +* @param[in] outputH output batchSize. +* @param[in] outputW output image data dim. +* @param[in] numChannels number of channels. +* @param[in] ratioH inImgH / outImgH. +* @param[in] ratioW inImgW / outImgW. +* +*/ extern void hl_bilinear_backward(real* inGrad, const size_t inImgH, const size_t inImgW, @@ -321,9 +371,13 @@ extern void hl_bilinear_backward(real* inGrad, * @param[in] featLen feature length = image height * image width. * @param[in] groups number of groups.
*/ -extern void hl_maxout_forward( - const real* inData, real* outData, int* idData, - size_t batchSize, size_t size, size_t featLen, size_t groups); +extern void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups); /** * @brief MaxOut backward. @@ -336,8 +390,12 @@ extern void hl_maxout_forward( * @param[in] featLen feature length = image height * image width. * @param[in] groups number of groups. */ -extern void hl_maxout_backward( - real* inGrad, const real* outGrad, const int* idData, - size_t batchSize, size_t size, size_t featLen, size_t groups); +extern void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t groups); #endif /* HL_CNN_H_ */ diff --git a/paddle/cuda/include/hl_cuda.h b/paddle/cuda/include/hl_cuda.h index 3196db67f61fd2e6b75df4abb3652df4456a0366..357286e3188a6f3184bc56e75232bf2e1ec54e44 100644 --- a/paddle/cuda/include/hl_cuda.h +++ b/paddle/cuda/include/hl_cuda.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_H_ #define HL_CUDA_H_ @@ -22,8 +21,7 @@ limitations under the License. */ /** * @brief HPPL event. */ -typedef struct _hl_event_st * hl_event_t; - +typedef struct _hl_event_st *hl_event_t; /** * @brief return cuda runtime api version. @@ -42,7 +40,7 @@ extern void hl_start(); * if device is NULL, will start all GPU. * @param[in] number number of devices. */ -extern void hl_specify_devices_start(int* device, int number); +extern void hl_specify_devices_start(int *device, int number); /** * @brief Queries if a device may directly access a peer device's memory. @@ -126,7 +124,7 @@ extern int hl_get_device(); * * @return dest_d pointer to device memory. */ -extern void* hl_malloc_device(size_t size); +extern void *hl_malloc_device(size_t size); /** * @brief Free device memory. @@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d); * * @return dest_h pointer to host memory. */ -extern void* hl_malloc_host(size_t size); +extern void *hl_malloc_host(size_t size); /** * @brief Free host page-lock memory. @@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed); * @param[in] stream stream id. */ extern void hl_memcpy_async(void *dst, - void *src, - size_t size, - hl_stream_t stream); + void *src, + size_t size, + hl_stream_t stream); /** * @brief Waits for stream tasks to complete. @@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event); * * @return time Time between start and end in ms. */ -extern float hl_event_elapsed_time(hl_event_t start, - hl_event_t end); +extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end); /** * @brief Records an event. @@ -300,7 +297,7 @@ extern void hl_set_device_flags_block(); /** * @brief Returns the last error string from a cuda runtime call. */ -extern const char* hl_get_device_error_string(); +extern const char *hl_get_device_error_string(); /** * @brief Returns the last error string from a cuda runtime call. @@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string(); * * @see hl_get_device_last_error() */ -extern const char* hl_get_device_error_string(size_t err); +extern const char *hl_get_device_error_string(size_t err); /** * @brief Returns the last error number. 
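Taken together, the allocation, copy, and stream primitives in hl_cuda.h compose as below — a minimal sketch assuming a GPU is present and hl_start() has initialized it (the matching page-locked host free is declared in a span this hunk elides, so the host buffer is deliberately left alone here):

#include "hl_base.h"
#include "hl_cuda.h"

void roundTrip(size_t n) {
  float* host = (float*)hl_malloc_host(n * sizeof(float));   // page-locked
  float* dev = (float*)hl_malloc_device(n * sizeof(float));
  // Both copies queue on the same stream, so they stay ordered;
  // synchronize before the host buffer is read again.
  hl_memcpy_async(dev, host, n * sizeof(float), HPPL_STREAM_1);
  hl_memcpy_async(host, dev, n * sizeof(float), HPPL_STREAM_1);
  hl_stream_synchronize(HPPL_STREAM_1);
  hl_free_mem_device(dev);
}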
diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h index d757317eb4a97559feef22d4fd8edf7c10ca6745..db8c03c2c01c67788622d37b5330e22c31e03f34 100644 --- a/paddle/cuda/include/hl_cuda_cublas.h +++ b/paddle/cuda/include/hl_cuda_cublas.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_CUBLAS_H_ #define HL_CUDA_CUBLAS_H_ @@ -29,12 +28,8 @@ limitations under the License. */ * @param[in] ldc the first dimension of C_d. * */ -extern void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN, - int lda, - int ldc); +extern void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc); /* * @brief Matrix transpose, while lda = dimN, ldc = dimM. @@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN); +extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN); /* * @brief Matrix inverse @@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d, * @param[in] ldc the first dimension of C_d * */ -extern void hl_matrix_inverse(real *A_d, - real *C_d, - int dimN, - int lda, - int ldc); +extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d @@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d, * @param[in] ldc the first dimension of C_d. * */ -extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +extern void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta, - int lda, int ldb, int ldc); + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d @@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, * @param[in] beta scalar used for multiplication. * */ -extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +extern void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief This function performs the matrix-vector multiplication. @@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa, * */ -extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta, - int lda, int incb, int incc); +extern void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta, + int lda, + int incb, + int incc); /** * @brief This function performs the matrix-vector multiplication. @@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, * @param[in] beta scalar used for multiplication. 
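The shorter hl_matrix_mul overload above is the documented C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d with implicit leading dimensions. A sketch, assuming A_d, B_d, and C_d were sized dimM x dimK, dimK x dimN, and dimM x dimN and filled on the device beforehand:

#include "hl_base.h"
#include "hl_cuda_cublas.h"

// Plain product: C_d = 1.0 * A_d * B_d + 0.0 * C_d, no transposes.
void gemm(real* A_d, real* B_d, real* C_d, int dimM, int dimN, int dimK) {
  hl_matrix_mul(A_d, HPPL_OP_N, B_d, HPPL_OP_N, C_d,
                dimM, dimN, dimK, 1.0, 0.0);
}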
* */ -extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta); +extern void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta); #endif /* HL_CUDA_CUBLAS_H_ */ diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h index f256cb54dfe69e8df7cc7fcc0ed0a58f3574acd3..3a2f916210277145efa8f6d7663a2698ea546b0b 100644 --- a/paddle/cuda/include/hl_cuda_cudnn.h +++ b/paddle/cuda/include/hl_cuda_cudnn.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_CUDNN_H_ #define HL_CUDA_CUDNN_H_ @@ -22,7 +21,7 @@ limitations under the License. */ * hppl pooling mode */ typedef enum { - HL_POOLING_MAX = 0, + HL_POOLING_MAX = 0, // average includes padded values HL_POOLING_AVERAGE = 1, // average does not include padded values @@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, * @param[in] sizeInBytes gpu workspace size (bytes). * @param[in] convBwdFilterAlgo backward filter algorithm. */ -extern void hl_convolution_backward_filter( - hl_tensor_descriptor input, - real* input_data, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_grad_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdFilterAlgo); +extern void hl_convolution_backward_filter(hl_tensor_descriptor input, + real* input_data, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_grad_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdFilterAlgo); /** * @brief convolution backward data(calculate input image grad data). @@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter( * @param[in] sizeInBytes gpu workspace size (bytes). * @param[in] convBwdDataAlgo backward data algorithm. */ -extern void hl_convolution_backward_data( - hl_tensor_descriptor input, - real* input_data_grad, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdDataAlgo); +extern void hl_convolution_backward_data(hl_tensor_descriptor input, + real* input_data_grad, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdDataAlgo); /** * @brief convolution backward bias(calculate bias grad data). @@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias, * @param[in] height matrix height. * @param[in] width matrix width. */ -extern void hl_softmax_forward(real *input, - real *output, +extern void hl_softmax_forward(real* input, + real* output, int height, int width); @@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input, * @param[in] height matrix height. * @param[in] width matrix width. 
*/ -extern void hl_softmax_backward(real *output_value, - real *output_grad, +extern void hl_softmax_backward(real* output_value, + real* output_grad, int height, int width); @@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value, * */ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, + real* scale, + real* bias, double factor, - real *runningMean, - real *runningInvVar, + real* runningMean, + real* runningInvVar, double epsilon, - real *savedMean, - real *savedVar); + real* savedMean, + real* savedVar); /** * @brief cudnn batch norm forward. @@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, * */ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, - real *estimatedMean, - real *estimatedVar, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedVar, double epsilon); /** @@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, * @param[in] inGradDesc input tensor descriptor desc. * @param[in] inGrad input data. * @param[in] dBnParamDesc tensor descriptor desc. - * bnScale, bnBias, running mean/var, save_mean/var. + * bnScale, bnBias, running mean/var, + * save_mean/var. * @param[in] scale batch normalization scale parameter (in original * paper scale is referred to as gamma). * @param[in] scaleGrad batch normalization scale parameter (in original @@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, * */ extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outGradDesc, - real *outGrad, + real* outGrad, hl_tensor_descriptor inGradDesc, - real *inGrad, + real* inGrad, hl_tensor_descriptor dBnParamDesc, - real *scale, - real *scaleGrad, - real *biasGrad, + real* scale, + real* scaleGrad, + real* biasGrad, double epsilon, - real *savedMean, - real *savedInvVar); + real* savedMean, + real* savedInvVar); #endif // HL_CUDA_CUDNN_H_ diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index f36c724e2da3dce11696fcda7daf98f5cda36dd6..1eb9f9ca888d3a93f04621e10346b5f9ff34cdca 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_DSO_LOADER_H_ #define HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_functions.h b/paddle/cuda/include/hl_functions.h index 65f366461ced0f9ee31ff9075f6dfaeb6c9b72a2..91ce9a0678463597df88c548aeac322ee19d95de 100644 --- a/paddle/cuda/include/hl_functions.h +++ b/paddle/cuda/include/hl_functions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_FUNCTIONS_H_ #define HL_FUNCTIONS_H_ @@ -21,30 +20,30 @@ limitations under the License. 
*/ /** * sigmoid threshold minimum */ -#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MIN -40.0 /** * sigmoid threshold maximum */ -#define SIGMOID_THRESHOLD_MAX 13.0 +#define SIGMOID_THRESHOLD_MAX 13.0 #ifndef __NVCC__ namespace hppl { - /* - * forward activation - */ - real relu(const real a); - real sigmoid(const real a); - real tanh(const real a); - real linear(const real a); - - /* - * backward activation - */ - real relu(const real a, const real b); - real sigmoid(const real a, const real b); - real tanh(const real a, const real b); - real linear(const real a, const real b); +/* + * forward activation + */ +real relu(const real a); +real sigmoid(const real a); +real tanh(const real a); +real linear(const real a); + +/* + * backward activation + */ +real relu(const real a, const real b); +real sigmoid(const real a, const real b); +real tanh(const real a, const real b); +real linear(const real a, const real b); } // namespace hppl #ifdef __AVX__ diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index 05039663b6e9f5e4a72f15ab822d723635f9b282..3be0df3b93b69811fb9c36dae223cbd927b02559 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_GPU_H_ #define HL_GPU_H_ diff --git a/paddle/cuda/include/hl_lstm.h b/paddle/cuda/include/hl_lstm.h index 1f95e318a1fe06050bbd31c2e276974f4a8bdc1e..7e527a79025969320f1aca75d313fd9d0194efd1 100644 --- a/paddle/cuda/include/hl_lstm.h +++ b/paddle/cuda/include/hl_lstm.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_LSTM_H_ #define HL_LSTM_H_ diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index 6195e30b9974d3ad092b4cf604e6b74fa481835c..96648661e345d8fa5d50cb2aae3a56ee53921f90 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_MATRIX_H_ #define HL_MATRIX_H_ @@ -30,13 +29,8 @@ limitations under the License. */ * @param[in] beta scalar used for addition. * */ -extern void hl_matrix_add(real* A_d, - real* B_d, - real* C_d, - int dimM, - int dimN, - real alpha, - real beta); +extern void hl_matrix_add( + real* A_d, real* B_d, real* C_d, int dimM, int dimN, real alpha, real beta); /** * @brief Matrix Softmax. * * @param[in] A_d matrix height. * @param[in] dimN matrix width. * */ -extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN); +extern void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN); /** * @brief Matrix softmax derivative. @@ -58,11 +52,8 @@ extern void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN); * @param[in] dimN matrix width. * */ -extern void hl_matrix_softmax_derivative(real* grad_d, - real* output_d, - real* sftmaxSum_d, - int dimM, - int dimN); +extern void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN); /** * @brief Sequence softmax.
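Back in hl_functions.h, the two SIGMOID_THRESHOLD macros only make sense as clamps on the activation input. The actual definitions live in the non-header sources, so the following is a plausible reading rather than code from this patch:

#include <math.h>

typedef float real;  // Paddle's real in the single-precision build

real sigmoid_clamped(real a) {
  const real kMin = -40.0f;  // SIGMOID_THRESHOLD_MIN
  const real kMax = 13.0f;   // SIGMOID_THRESHOLD_MAX
  real x = a < kMin ? kMin : (a > kMax ? kMax : a);
  return 1.0f / (1.0f + expf(-x));  // exp can no longer overflow
}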
@@ -73,8 +64,8 @@ extern void hl_matrix_softmax_derivative(real* grad_d, * @param[in] numSequence sequence number. * */ -extern void hl_sequence_softmax_forward(real *A_d, - real *C_d, +extern void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence); @@ -88,11 +79,8 @@ extern void hl_sequence_softmax_forward(real *A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_classification_error(real* A_d, - int* B_d, - real* C_d, - int dimM, - int dimN); +extern void hl_matrix_classification_error( + real* A_d, int* B_d, real* C_d, int dimM, int dimN); /** * @brief Matrix cross entropy. @@ -104,11 +92,8 @@ extern void hl_matrix_classification_error(real* A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN); +extern void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN); /** * @brief Matrix cross entropy back propagation. @@ -120,11 +105,8 @@ extern void hl_matrix_cross_entropy(real* A_d, * @param[in] dimN matrix width. * */ -extern void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN); +extern void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN); /** * @brief Matrix multi-binary label cross entropy @@ -135,11 +117,8 @@ extern void hl_matrix_cross_entropy_bp(real* grad_d, * @param[in] dimM matrix height. * @param[in] dimN matrix width. */ -extern void hl_matrix_multi_binary_cross_entropy(real* output, - real* entropy, - hl_sparse_matrix_s mat, - int dimM, - int dimN); +extern void hl_matrix_multi_binary_cross_entropy( + real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN); /** * @brief Matrix multi-binary label cross entropy backprop @@ -150,11 +129,8 @@ extern void hl_matrix_multi_binary_cross_entropy(real* output, * @param[in] dimM matrix height. * @param[in] dimN matrix width. */ -extern void hl_matrix_multi_binary_cross_entropy_bp(real* output, - real* grad, - hl_sparse_matrix_s mat, - int dimM, - int dimN); +extern void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN); /** * @brief Matrix zero memory. @@ -176,12 +152,8 @@ extern void hl_matrix_zero_mem(real* data, int num); * @param[in] partial_sum */ -extern void hl_param_relu_forward(real* output, - real* input, - real* w, - int width, - int height, - int partial_sum); +extern void hl_param_relu_forward( + real* output, real* input, real* w, int width, int height, int partial_sum); /** * @brief parameter relu backward w * diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index 46d86b2982f065802eec83ca7554f787d1d02f3a..bb5124df44b492bd8fdeb2a0c75ebcf74d2c8157 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SEQUENCE_H_ #define HL_SEQUENCE_H_ @@ -32,7 +31,7 @@ limitations under the License. */ extern void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim); @@ -46,11 +45,8 @@ extern void hl_max_sequence_forward(real* input, * @param[in] dim input dimension. 
* */ -extern void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim); +extern void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim); /** * @brief Context projection forward. @@ -63,7 +59,8 @@ extern void hl_max_sequence_backward(real* outputGrad, * @param[in] inputDim input sequence dimension. * @param[in] contextLength context length. * @param[in] contextStart context start. - * @param[in] beginPad number of extra timesteps added at the beginning. + * @param[in] beginPad number of extra timesteps added at the + * beginning. * @param[in] isPadding trainable padding. * */ @@ -109,7 +106,8 @@ extern void hl_context_projection_backward_data(real* outputGrad, * @param[in] totalPad number of extra timesteps. * @param[in] contextLength context length. * @param[in] contextStart context start. - * @param[in] beginPad number of extra timesteps added at the beginning. + * @param[in] beginPad number of extra timesteps added at the + * beginning. * */ extern void hl_context_projection_backward_weight(real* outputGrad, @@ -141,9 +139,9 @@ extern void hl_context_projection_backward_weight(real* outputGrad, * @param[in] seq2batch copy direction. * */ -extern void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +extern void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch); @@ -167,9 +165,9 @@ extern void hl_sequence2batch_copy(real *batch, * @param[in] seq2batch copy direction. * */ -extern void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +extern void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch); diff --git a/paddle/cuda/include/hl_sparse.h b/paddle/cuda/include/hl_sparse.h index 9acdebdebf37761e1485e3441963586ead9f3c85..c4e0be23e2031cbcb124b532216a23d8a344668d 100644 --- a/paddle/cuda/include/hl_sparse.h +++ b/paddle/cuda/include/hl_sparse.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SPARSE_H_ #define HL_SPARSE_H_ @@ -31,7 +30,7 @@ limitations under the License. 
*/ */ extern void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz); @@ -60,10 +59,10 @@ extern void hl_free_sparse_matrix(hl_sparse_matrix_s A_d); * */ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz); @@ -94,11 +93,11 @@ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, * */ extern void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz); @@ -259,10 +258,14 @@ extern void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, */ extern void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d. @@ -311,11 +314,16 @@ extern void hl_matrix_dense_mul_csc(real *A_d, * @note transb is not support HPPL_OP_T. * */ -extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +extern void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d @@ -336,12 +344,16 @@ extern void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa, * @note transa is not support HPPL_OP_T. * */ -extern void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, +extern void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, hl_sparse_matrix_s B_d, hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta); + int dimM, + int dimN, + int dimK, + real alpha, + real beta); /** * @brief Memcpy csc_matrix to host. 
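[Editor's reading aid: the sparse multiply routines above (hl_matrix_csr_mul_dense, hl_matrix_csc_mul_dense, hl_sparse_matrix_mul, hl_matrix_dense_mul_csr) all follow the BLAS-style update C = alpha * op(A) * op(B) + beta * C stated in their @brief lines. A dense-output CSR sketch to make that concrete, assuming real == float, no transposition, and standard CSR arrays (rowPtr/cols/vals); names are illustrative.]

// C (dimM x dimN, row-major, dense) = alpha * A * B + beta * C,
// where A is dimM x dimK in CSR form and B is dimK x dimN dense.
void csr_mul_dense_ref(const int* rowPtr, const int* cols, const float* vals,
                       const float* B, float* C,
                       int dimM, int dimN, float alpha, float beta) {
  for (int i = 0; i < dimM; ++i) {
    for (int j = 0; j < dimN; ++j) C[i * dimN + j] *= beta;
    for (int p = rowPtr[i]; p < rowPtr[i + 1]; ++p) {
      float av = alpha * vals[p];
      const float* brow = B + cols[p] * dimN;  // row cols[p] of B
      for (int j = 0; j < dimN; ++j) C[i * dimN + j] += av * brow[j];
    }
  }
}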
@@ -412,7 +424,6 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val, hl_sparse_matrix_s csr_matrix, hl_stream_t stream); - /** * @brief A_d[j] += B_d[i,j] for i in range(height) * @@ -423,19 +434,13 @@ extern void hl_memcpy_from_csr_matrix(real *csr_val, * @param[in] scale scale of B_d * */ -extern void hl_sparse_matrix_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale); +extern void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale); /** * @brief implementation of csr sparse matrix in hl_sparse_matirx_column_sum */ -extern void hl_matrix_csr_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale); +extern void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale); /** * @brief A_d[i,j] += B_d[j] @@ -446,13 +451,13 @@ extern void hl_matrix_csr_column_sum(real* A_d, * */ extern void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale); /** * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_bias */ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale); /** @@ -470,7 +475,7 @@ extern void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, * */ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, @@ -479,7 +484,7 @@ extern void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, * @brief implementation of csr sparse matrix in hl_sparse_matrix_add_dense */ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, @@ -493,7 +498,7 @@ extern void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, * @return return rows pointer, which is gpu address * */ -extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat); +extern int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat); /** * @brief get cols pionter of GpuSparseMatrix @@ -503,7 +508,7 @@ extern int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat); * @return return cols pointer, which is gpu address * */ -extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat); +extern int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat); /** * @brief get value pionter of GpuSparseMatrix @@ -513,7 +518,6 @@ extern int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat); * @return return value pointer, which is gpu address * */ -extern real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat); - +extern real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat); #endif /* HL_SPARSE_H_ */ diff --git a/paddle/cuda/include/hl_table_apply.h b/paddle/cuda/include/hl_table_apply.h index 3c9428e9253d5ed563e4e9f62d8842667496b83c..b4ac83a66af13c2a843872faba2ebd972008a738 100644 --- a/paddle/cuda/include/hl_table_apply.h +++ b/paddle/cuda/include/hl_table_apply.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_TABLE_APPLY_H_ #define HL_TABLE_APPLY_H_ @@ -31,8 +30,10 @@ limitations under the License. */ * @param[in] dim width of table. 
* */ -extern void hl_matrix_select_rows(real* output, int ldo, - real* table, int ldt, +extern void hl_matrix_select_rows(real* output, + int ldo, + real* table, + int ldt, int* ids, int numSamples, int tableSize, @@ -53,8 +54,10 @@ extern void hl_matrix_select_rows(real* output, int ldo, * @param[in] dim width of table. * */ -extern void hl_matrix_add_to_rows(real* table, int ldt, - real* input, int ldi, +extern void hl_matrix_add_to_rows(real* table, + int ldt, + real* input, + int ldi, int* ids, int numSamples, int tableSize, @@ -72,8 +75,7 @@ extern void hl_matrix_add_to_rows(real* table, int ldt, * */ template -extern void hl_vector_select_from(T* dst, int sized, - const T* src, int sizes, - const int* ids, int sizei); +extern void hl_vector_select_from( + T* dst, int sized, const T* src, int sizes, const int* ids, int sizei); -#endif /* HL_TABLE_APPLY_H_ */ +#endif /* HL_TABLE_APPLY_H_ */ diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h index 4414b0b2d2ed4ab6a48294ffaed3a43a639e5950..b0a88c66a12fcfec6ea96b877423f907dac8dfa1 100644 --- a/paddle/cuda/include/hl_time.h +++ b/paddle/cuda/include/hl_time.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_TIME_H_ #define HL_TIME_H_ diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h index a38d4cf862278a060f72b970d723895dc3735d0a..e8cfebbf6a3bd27c10a71d7817238bc304681fa4 100644 --- a/paddle/cuda/include/hl_top_k.h +++ b/paddle/cuda/include/hl_top_k.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_TOP_K_H_ #define HL_TOP_K_H_ @@ -31,9 +30,11 @@ limitations under the License. */ * @param[in] numSamples height of input value. * */ -extern void hl_matrix_top_k(real* topVal, int ldv, - int * topIds, - real* src, int lds, +extern void hl_matrix_top_k(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, int dim, int beamSize, int numSamples); @@ -50,8 +51,9 @@ extern void hl_matrix_top_k(real* topVal, int ldv, * * @note Only support HL_SPARSE_CSR format. */ -extern void hl_sparse_matrix_top_k(real* topVal, int ldv, - int * topIds, +extern void hl_sparse_matrix_top_k(real* topVal, + int ldv, + int* topIds, hl_sparse_matrix_s src, int beamSize, int numSamples); diff --git a/paddle/cuda/include/stub/hl_aggregate_stub.h b/paddle/cuda/include/stub/hl_aggregate_stub.h index 4c0c68f3c98fe95f01060b82c3a1b9822d2a3715..bb53fc581e09905aa7a9b2d8dfe44b04c677c40a 100644 --- a/paddle/cuda/include/stub/hl_aggregate_stub.h +++ b/paddle/cuda/include/stub/hl_aggregate_stub.h @@ -12,29 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
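[Editor's reading aid: hl_matrix_top_k above selects, per row of src, the beamSize largest values and their column indices. A CPU sketch of that row-wise top-k using std::partial_sort; the ldv/lds leading dimensions mirror the signature, while the topIds row stride of beamSize is an assumption, since the declaration carries no separate stride for it.]

#include <algorithm>
#include <numeric>
#include <vector>

// For each of numSamples rows of src (row stride lds, width dim), write the
// beamSize largest values into topVal (row stride ldv) and their column
// indices into topIds, in descending order of value.
void top_k_rows_ref(float* topVal, int ldv, int* topIds,
                    const float* src, int lds,
                    int dim, int beamSize, int numSamples) {
  std::vector<int> idx(dim);
  for (int r = 0; r < numSamples; ++r) {
    const float* row = src + r * lds;
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + beamSize, idx.end(),
                      [row](int a, int b) { return row[a] > row[b]; });
    for (int k = 0; k < beamSize; ++k) {
      topVal[r * ldv + k] = row[idx[k]];
      topIds[r * beamSize + k] = idx[k];
    }
  }
}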
*/ - #ifndef HL_AGGREGATE_STUB_H_ #define HL_AGGREGATE_STUB_H_ #include "hl_aggregate.h" -inline void hl_matrix_row_sum(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_row_max(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_row_min(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_column_sum(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_column_max(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_column_min(real *A_d, real *C_d, - int dimM, int dimN) {} +inline void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {} inline void hl_vector_sum(real *A_d, real *C_h, int dimM) {} diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index c6f32ad337705ff938b7b370a4785dc7f4393041..2f73b9671edd3609996aebff2913f5262805f869 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -12,84 +12,134 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CNN_STUB_H_ #define HL_CNN_STUB_H_ #include "hl_cnn.h" -inline void hl_shrink_col2feature( - const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, - real alpha, real beta) {} - -inline void hl_expand_feature2col( - const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol) {} - -inline void hl_maxpool_forward( - const int frameCnt, const real* inputData, - const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) {} - -inline void hl_maxpool_backward( - const int frameCnt, const real* inputData, - const real* outData, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real scaleA, real scaleB, - real* targetGrad, const int outStride) {} - -inline void hl_avgpool_forward( - const int frameCnt, const real* inputData, - const int channels, - const int height, const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - const int paddingH, const int paddingW, - real* tgtData, const int tgtStride) {} - -inline void hl_avgpool_backward( - const int frameCnt, const real* outGrad, - const int channels, const int height, - const int width, - const int pooledH, const int pooledW, - const int sizeX, const int sizeY, - const int strideH, const int strideW, - int paddingH, int 
paddingW, - real scaleA, real scaleB, - real* backGrad, const int outStride) {} - -inline void hl_CMRNorm_forward( - size_t frameCnt, const real* in, real* scale, real* out, - size_t channels, size_t height, size_t width, size_t sizeX, - real alpha, real beta) {} - -inline void hl_CMRNorm_backward( - size_t frameCnt, const real* inV, const real* scale, - const real* outV, const real* outDiff, real *inDiff, - size_t channels, size_t height, size_t width, size_t sizeX, - real alpha, real beta) {} +inline void hl_shrink_col2feature(const real* dataCol, + size_t channels, + size_t height, + size_t width, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t outputH, + size_t outputW, + real* dataIm, + real alpha, + real beta) {} + +inline void hl_expand_feature2col(const real* dataIm, + size_t channels, + size_t height, + size_t width, + size_t blockH, + size_t blockW, + size_t strideH, + size_t strideW, + size_t paddingH, + size_t paddingW, + size_t outputH, + size_t outputW, + real* dataCol) {} + +inline void hl_maxpool_forward(const int frameCnt, + const real* inputData, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) {} + +inline void hl_maxpool_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) {} + +inline void hl_avgpool_forward(const int frameCnt, + const real* inputData, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) {} + +inline void hl_avgpool_backward(const int frameCnt, + const real* outGrad, + const int channels, + const int height, + const int width, + const int pooledH, + const int pooledW, + const int sizeX, + const int sizeY, + const int strideH, + const int strideW, + int paddingH, + int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) {} + +inline void hl_CMRNorm_forward(size_t frameCnt, + const real* in, + real* scale, + real* out, + size_t channels, + size_t height, + size_t width, + size_t sizeX, + real alpha, + real beta) {} + +inline void hl_CMRNorm_backward(size_t frameCnt, + const real* inV, + const real* scale, + const real* outV, + const real* outDiff, + real* inDiff, + size_t channels, + size_t height, + size_t width, + size_t sizeX, + real alpha, + real beta) {} inline void hl_bilinear_forward(const real* inData, const size_t inImgH, @@ -106,25 +156,33 @@ inline void hl_bilinear_forward(const real* inData, const real ratioW) {} inline void hl_bilinear_backward(real* inGrad, - const size_t inImgH, - const size_t inImgW, - const size_t inputH, - const size_t inputW, - const real* outGrad, - const size_t outImgH, - const size_t outImgW, - const size_t outputH, - const size_t outputW, - const size_t numChannels, - const real ratioH, - const real ratioW) {} - -inline void hl_maxout_forward( - const 
real* inData, real* outData, int* idData, - size_t batchSize, size_t size, size_t featLen, size_t group) {} - -inline void hl_maxout_backward( - real* inGrad, const real* outGrad, const int* idData, - size_t batchSize, size_t size, size_t featLen, size_t group) {} + const size_t inImgH, + const size_t inImgW, + const size_t inputH, + const size_t inputW, + const real* outGrad, + const size_t outImgH, + const size_t outImgW, + const size_t outputH, + const size_t outputW, + const size_t numChannels, + const real ratioH, + const real ratioW) {} + +inline void hl_maxout_forward(const real* inData, + real* outData, + int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t group) {} + +inline void hl_maxout_backward(real* inGrad, + const real* outGrad, + const int* idData, + size_t batchSize, + size_t size, + size_t featLen, + size_t group) {} #endif // HL_CNN_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h index 903dcbe8355d6f593d96bc1f9e686d54035a9366..85f7c390c47397127487b16fdc933f0afe2fb880 100644 --- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h @@ -12,41 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_CUDA_CUBLAS_STUB_H_ #define HL_CUDA_CUBLAS_STUB_H_ #include "hl_cuda_cublas.h" -inline void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN, - int lda, - int ldc) {} - -inline void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN) {} - -inline void hl_matrix_inverse(real *A_d, - real *C_d, - int dimN, - int lda, - int ldc) {} - -inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, - real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta, - int lda, int ldb, int ldc) {} +inline void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) {} + +inline void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {} -inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +inline void hl_matrix_inverse( + real *A_d, real *C_d, int dimN, int lda, int ldc) {} + +inline void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, + real *C_d, + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc) {} + +inline void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} #endif // HL_CUDA_CUBLAS_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h index b96804afd86ba5e8c7b7eed7eb768295b4e23096..3beb0e5b5170261a6c453936b8b0347f3e97dbff 100644 --- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h @@ -12,15 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
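[Editor's reading aid: the hl_maxout_forward stub above takes (inData, outData, idData, batchSize, size, featLen, group). A sketch of the usual maxout semantics, assuming each output channel is the element-wise max over `group` consecutive input channels of length featLen; the exact layout is an assumption here, not lifted from the CUDA kernel.]

#include <cstddef>

// Maxout forward: out[i] = max over g of in[channel(i)*group*featLen
// + g*featLen + offset(i)]; idData records the winning group member
// so the backward pass can route gradients.
void maxout_forward_ref(const float* inData, float* outData, int* idData,
                        size_t batchSize, size_t size, size_t featLen,
                        size_t group) {
  for (size_t b = 0; b < batchSize; ++b) {
    const float* in = inData + b * size * group;
    float* out = outData + b * size;
    int* id = idData + b * size;
    for (size_t i = 0; i < size; ++i) {
      size_t channel = i / featLen, offset = i % featLen;
      size_t base = channel * group * featLen + offset;
      float best = in[base];
      int bestG = 0;
      for (size_t g = 1; g < group; ++g) {
        float v = in[base + g * featLen];
        if (v > best) { best = v; bestG = static_cast<int>(g); }
      }
      out[i] = best;
      id[i] = bestG;
    }
  }
}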
*/ - #ifndef HL_CUDA_CUDNN_STUB_H_ #define HL_CUDA_CUDNN_STUB_H_ #include "hl_cuda_cudnn.h" -inline int hl_get_cudnn_lib_version() { - return 0; -} +inline int hl_get_cudnn_lib_version() { return 0; } inline void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {} @@ -68,41 +65,41 @@ inline void hl_pooling_backward(hl_tensor_descriptor input, hl_pooling_descriptor pooling) {} inline void hl_create_filter_descriptor(hl_filter_descriptor* filter, - int input_feature_maps, - int output_feature_maps, - int height, - int width) {} + int input_feature_maps, + int output_feature_maps, + int height, + int width) {} inline void hl_destroy_filter_descriptor(hl_filter_descriptor filter) {} inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, - hl_tensor_descriptor image, - hl_filter_descriptor filter, - int padding_height, - int padding_width, - int stride_height, - int stride_width) {} + hl_tensor_descriptor image, + hl_filter_descriptor filter, + int padding_height, + int padding_width, + int stride_height, + int stride_width) {} inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, - hl_tensor_descriptor image, - hl_filter_descriptor filter, - int padding_height, - int padding_width, - int stride_height, - int stride_width) {} + hl_tensor_descriptor image, + hl_filter_descriptor filter, + int padding_height, + int padding_width, + int stride_height, + int stride_width) {} inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {} inline void hl_conv_workspace(hl_tensor_descriptor input, - hl_tensor_descriptor output, - hl_filter_descriptor filter, - hl_convolution_descriptor conv, - int* convFwdAlgo, - size_t* fwdLimitBytes, - int* convBwdDataAlgo, - size_t* bwdDataLimitBytes, - int* convBwdFilterAlgo, - size_t* bwdFilterLimitBytes) {} + hl_tensor_descriptor output, + hl_filter_descriptor filter, + hl_convolution_descriptor conv, + int* convFwdAlgo, + size_t* fwdLimitBytes, + int* convBwdDataAlgo, + size_t* bwdDataLimitBytes, + int* convBwdFilterAlgo, + size_t* bwdFilterLimitBytes) {} inline void hl_convolution_forward(hl_tensor_descriptor input, real* input_data, @@ -116,86 +113,84 @@ inline void hl_convolution_forward(hl_tensor_descriptor input, int convFwdAlgo) {} inline void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, - real* bias_data, - hl_tensor_descriptor output, - real* output_data) {} - -inline void hl_convolution_backward_filter( - hl_tensor_descriptor input, - real* input_data, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_grad_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdFilterAlgo) {} - -inline void hl_convolution_backward_data( - hl_tensor_descriptor input, - real* input_data_grad, - hl_tensor_descriptor output, - real* output_grad_data, - hl_filter_descriptor filter, - real* filter_data, - hl_convolution_descriptor conv, - void* gpuWorkSpace, - size_t sizeInBytes, - int convBwdDataAlgo) {} + real* bias_data, + hl_tensor_descriptor output, + real* output_data) {} + +inline void hl_convolution_backward_filter(hl_tensor_descriptor input, + real* input_data, + hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_grad_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdFilterAlgo) {} + +inline void hl_convolution_backward_data(hl_tensor_descriptor input, + real* input_data_grad, + 
hl_tensor_descriptor output, + real* output_grad_data, + hl_filter_descriptor filter, + real* filter_data, + hl_convolution_descriptor conv, + void* gpuWorkSpace, + size_t sizeInBytes, + int convBwdDataAlgo) {} inline void hl_convolution_backward_bias(hl_tensor_descriptor bias, - real* bias_grad_data, - hl_tensor_descriptor output, - real* output_grad_data) {} + real* bias_grad_data, + hl_tensor_descriptor output, + real* output_grad_data) {} -inline void hl_softmax_forward(real *input, - real *output, - int height, - int width) {} - -inline void hl_softmax_backward(real *output_value, - real *output_grad, +inline void hl_softmax_forward(real* input, + real* output, int height, int width) {} +inline void hl_softmax_backward(real* output_value, + real* output_grad, + int height, + int width) {} + inline void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, + real* scale, + real* bias, double factor, - real *runningMean, - real *runningInvVar, + real* runningMean, + real* runningInvVar, double epsilon, - real *savedMean, - real *savedVar) {} + real* savedMean, + real* savedVar) {} inline void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, - real *estimatedMean, - real *estimatedVar, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedVar, double epsilon) {} inline void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outGradDesc, - real *outGrad, + real* outGrad, hl_tensor_descriptor inGradDesc, - real *inGrad, + real* inGrad, hl_tensor_descriptor dBnParamDesc, - real *scale, - real *scaleGrad, - real *biasGrad, + real* scale, + real* scaleGrad, + real* biasGrad, double epsilon, - real *savedMean, - real *savedInvVar) {} + real* savedMean, + real* savedInvVar) {} #endif // HL_CUDA_CUDNN_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_cuda_stub.h b/paddle/cuda/include/stub/hl_cuda_stub.h index 675ac03b0e188e9b26038dd4e40264099618e17a..1f91068cdf8b3d472c4b403d1ec7d5293c28c07e 100644 --- a/paddle/cuda/include/stub/hl_cuda_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
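[Editor's reading aid: the batch-norm stubs above mirror the cuDNN-backed declarations in hl_cuda_cudnn.h. The inference path computes, per channel c, y = scale[c] * (x - estimatedMean[c]) / sqrt(estimatedVar[c] + epsilon) + bias[c]. A minimal sketch under the assumption of an NCHW tensor with real == float; names are illustrative.]

#include <cmath>
#include <cstddef>

// Per-channel batch-norm inference for an NCHW tensor:
// y = scale[c] * (x - mean[c]) / sqrt(var[c] + eps) + bias[c]
void batch_norm_inference_ref(const float* x, float* y,
                              const float* scale, const float* bias,
                              const float* mean, const float* var,
                              double eps, size_t N, size_t C, size_t HW) {
  for (size_t n = 0; n < N; ++n)
    for (size_t c = 0; c < C; ++c) {
      float inv = 1.0f / std::sqrt(static_cast<float>(var[c] + eps));
      const float* xs = x + (n * C + c) * HW;
      float* ys = y + (n * C + c) * HW;
      for (size_t i = 0; i < HW; ++i)
        ys[i] = scale[c] * (xs[i] - mean[c]) * inv + bias[c];
    }
}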
*/ - #ifndef HL_CUDA_STUB_H_ #define HL_CUDA_STUB_H_ @@ -24,29 +23,25 @@ inline void hl_specify_devices_start(int *device, int number) {} inline void hl_init(int device) {} -inline int hl_get_cuda_lib_version(int device) { - return 0; -} +inline int hl_get_cuda_lib_version(int device) { return 0; } inline void hl_fini() {} inline void hl_set_sync_flag(bool flag) {} -inline bool hl_get_sync_flag() { - return false; -} +inline bool hl_get_sync_flag() { return false; } -inline int hl_get_device_count() { return 0; } +inline int hl_get_device_count() { return 0; } inline void hl_set_device(int device) {} -inline int hl_get_device() { return 0; } +inline int hl_get_device() { return 0; } -inline void* hl_malloc_device(size_t size) { return NULL; } +inline void *hl_malloc_device(size_t size) { return NULL; } inline void hl_free_mem_device(void *dest_d) {} -inline void* hl_malloc_host(size_t size) { return NULL; } +inline void *hl_malloc_host(size_t size) { return NULL; } inline void hl_free_mem_host(void *dest_h) {} @@ -64,7 +59,9 @@ inline void hl_rand(real *dest_d, size_t num) {} inline void hl_srand(unsigned int seed) {} -inline void hl_memcpy_async(void *dst, void *src, size_t size, +inline void hl_memcpy_async(void *dst, + void *src, + size_t size, hl_stream_t stream) {} inline void hl_stream_synchronize(hl_stream_t stream) {} @@ -83,11 +80,11 @@ inline void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {} inline void hl_event_synchronize(hl_event_t event) {} -inline int hl_get_device_last_error() { return 0; } +inline int hl_get_device_last_error() { return 0; } -inline const char* hl_get_device_error_string() { return NULL; } +inline const char *hl_get_device_error_string() { return NULL; } -inline const char* hl_get_device_error_string(size_t err) { return NULL; } +inline const char *hl_get_device_error_string(size_t err) { return NULL; } inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; } diff --git a/paddle/cuda/include/stub/hl_lstm_stub.h b/paddle/cuda/include/stub/hl_lstm_stub.h index 2700bef02a5e1e40ee7603ccab7fec754196f8cd..7ccda032d26f2fbbe99136e8481416daea557a78 100644 --- a/paddle/cuda/include/stub/hl_lstm_stub.h +++ b/paddle/cuda/include/stub/hl_lstm_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_LSTM_STUB_H_ #define HL_LSTM_STUB_H_ diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index 76cac2e57769301fee2e5979e2685976daf35441..1bd78d23fbaf46e6265ba0db25ea399a204bd96f 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #ifndef HL_MATRIX_STUB_H_ #define HL_MATRIX_STUB_H_ @@ -26,48 +25,30 @@ inline void hl_matrix_add(real* A_d, real alpha, real beta) {} -inline void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {} +inline void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {} -inline void hl_sequence_softmax_forward(real *A_d, - real *C_d, +inline void hl_sequence_softmax_forward(real* A_d, + real* C_d, const int* index, int numSequence) {} -inline void hl_matrix_softmax_derivative(real* grad_d, - real* output_d, - real* sftmaxSum_d, - int dimM, - int dimN) {} - -inline void hl_matrix_classification_error(real* A_d, - int* B_d, - real* C_d, - int dimM, - int dimN) {} - -inline void hl_matrix_cross_entropy(real* A_d, - real* C_d, - int* label_d, - int dimM, - int dimN) {} - -inline void hl_matrix_cross_entropy_bp(real* grad_d, - real* output_d, - int* label_d, - int dimM, - int dimN) {} - -inline void hl_matrix_multi_binary_cross_entropy(real* output, - real* entropy, - hl_sparse_matrix_s mat, - int dimM, - int dimN) {} - -inline void hl_matrix_multi_binary_cross_entropy_bp(real* output, - real* grad, - hl_sparse_matrix_s mat, - int dimM, - int dimN) {} +inline void hl_matrix_softmax_derivative( + real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {} + +inline void hl_matrix_classification_error( + real* A_d, int* B_d, real* C_d, int dimM, int dimN) {} + +inline void hl_matrix_cross_entropy( + real* A_d, real* C_d, int* label_d, int dimM, int dimN) {} + +inline void hl_matrix_cross_entropy_bp( + real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {} + +inline void hl_matrix_multi_binary_cross_entropy( + real* output, real* entropy, hl_sparse_matrix_s mat, int dimM, int dimN) {} + +inline void hl_matrix_multi_binary_cross_entropy_bp( + real* output, real* grad, hl_sparse_matrix_s mat, int dimM, int dimN) {} inline void hl_matrix_zero_mem(real* data, int num) {} @@ -101,7 +82,6 @@ inline void hl_cossim(real* output, int input2_height, real scale) {} - inline void hl_cossim_derivative(real* grad, real* output, real* prevOutX, diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index aabd956c37f7dce48a379b995ab88a53aa65c760..381f0a6f26c5669465f029e972c6ca8b0e6e1776 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SEQUENCE_STUB_H_ #define HL_SEQUENCE_STUB_H_ @@ -21,15 +20,12 @@ limitations under the License. 
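[Editor's reading aid: the cross-entropy stubs above take a dimM x dimN probability matrix and one integer label per row. A plausible CPU reading of the forward contract, where each output entry is the negative log-probability assigned to the true class; the epsilon guard is added here for safety and is not taken from the kernel.]

#include <cmath>

// cost[i] = -log(prob[i][label[i]]) for each of the dimM rows of a
// dimM x dimN row-major probability matrix (e.g. a softmax output).
void cross_entropy_ref(const float* prob, float* cost, const int* label,
                       int dimM, int dimN) {
  const float kEps = 1e-12f;  // guard against log(0); illustrative only
  for (int i = 0; i < dimM; ++i) {
    float p = prob[i * dimN + label[i]];
    cost[i] = -std::log(p > kEps ? p : kEps);
  }
}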
*/ inline void hl_max_sequence_forward(real* input, const int* sequence, real* output, - int *index, + int* index, int numSequences, int dim) {} -inline void hl_max_sequence_backward(real* outputGrad, - int *index, - real* inputGrad, - int numSequences, - int dim) {} +inline void hl_max_sequence_backward( + real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {} inline void hl_context_projection_forward(real* input, const int* sequence, @@ -60,16 +56,16 @@ inline void hl_context_projection_backward_weight(real* outputGrad, int contextStart, int beginPad) {} -inline void hl_sequence2batch_copy(real *batch, - real *sequence, - const int *batchIndex, +inline void hl_sequence2batch_copy(real* batch, + real* sequence, + const int* batchIndex, int seqWidth, int batchCount, bool seq2batch) {} -inline void hl_sequence2batch_add(real *batch, - real *sequence, - int *batchIndex, +inline void hl_sequence2batch_add(real* batch, + real* sequence, + int* batchIndex, int seqWidth, int batchCount, bool seq2batch) {} diff --git a/paddle/cuda/include/stub/hl_sparse_stub.h b/paddle/cuda/include/stub/hl_sparse_stub.h index 346a1900dda5825e9a4311a2c51e8a50e6e7df0b..d47bdd2c47d097c4c68b7b7e88ef888bc18270c2 100644 --- a/paddle/cuda/include/stub/hl_sparse_stub.h +++ b/paddle/cuda/include/stub/hl_sparse_stub.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef HL_SPARSE_STUB_H_ #define HL_SPARSE_STUB_H_ @@ -20,7 +19,7 @@ limitations under the License. */ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) {} @@ -28,20 +27,20 @@ inline void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d, inline void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {} inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - void * dest_d, + void *dest_d, size_t size, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) {} inline void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d, - real* value_d, - int* rows_d, - int* cols_d, + real *value_d, + int *rows_d, + int *cols_d, hl_matrix_format_t format, - hl_matrix_value_t value_type, + hl_matrix_value_t value_type, int dimM, int dimN, int nnz) {} @@ -87,10 +86,14 @@ inline void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, inline void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} inline void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa, @@ -103,18 +106,27 @@ inline void hl_matrix_dense_mul_csc(real *A_d, real alpha, real beta) {} -inline void hl_sparse_matrix_mul(real* A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +inline void hl_sparse_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, hl_sparse_matrix_s C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} -inline void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa, +inline void hl_matrix_dense_mul_csr(real *A_d, + hl_trans_op_t transa, hl_sparse_matrix_s B_d, hl_trans_op_t transb, real 
*C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) {} + int dimM, + int dimN, + int dimK, + real alpha, + real beta) {} inline void hl_memcpy_from_csc_matrix(real *csc_val, size_t val_size, @@ -134,49 +146,39 @@ inline void hl_memcpy_from_csr_matrix(real *csr_val, hl_sparse_matrix_s csr_matrix, hl_stream_t stream) {} -inline void hl_sparse_matrix_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale) {} +inline void hl_sparse_matrix_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {} -inline void hl_matrix_csr_column_sum(real* A_d, - hl_sparse_matrix_s B_d, - int dimM, - int dimN, - real scale) {} +inline void hl_matrix_csr_column_sum( + real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {} inline void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale) {} inline void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, real scale) {} inline void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, real beta) {} inline void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, - real* B_d, + real *B_d, int dimM, int dimN, real alpha, real beta) {} -inline int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { - return NULL; -} +inline int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) { return NULL; } -inline int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { - return NULL; -} +inline int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) { return NULL; } -inline real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { +inline real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) { return NULL; } diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h index 2922d4dc2937662d66fb2433f4883448ba21fa3f..2412ed5abc13b2a83521a75524f581e106788b60 100644 --- a/paddle/cuda/src/avx_mathfun.h +++ b/paddle/cuda/src/avx_mathfun.h @@ -32,32 +32,35 @@ #include /* yes I know, the top of this file is quite ugly */ -# define ALIGN32_BEG -# define ALIGN32_END __attribute__((aligned(32))) +#define ALIGN32_BEG +#define ALIGN32_END __attribute__((aligned(32))) /* __m128 is ugly to write */ -typedef __m256 v8sf; // vector of 8 float (avx) -typedef __m256i v8si; // vector of 8 int (avx) -typedef __m128i v4si; // vector of 8 int (avx) +typedef __m256 v8sf; // vector of 8 float (avx) +typedef __m256i v8si; // vector of 8 int (avx) +typedef __m128i v4si; // vector of 8 int (avx) -#define _PI32AVX_CONST(Name, Val) \ - static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { Val, Val, Val, Val } +#define _PI32AVX_CONST(Name, Val) \ + static const ALIGN32_BEG int _pi32avx_##Name[4] ALIGN32_END = { \ + Val, Val, Val, Val} _PI32AVX_CONST(1, 1); _PI32AVX_CONST(inv1, ~1); _PI32AVX_CONST(2, 2); _PI32AVX_CONST(4, 4); - /* declare some AVX constants -- why can't I figure a better way to do that? 
*/ -#define _PS256_CONST(Name, Val) \ - static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } -#define _PI32_CONST256(Name, Val) \ - static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } -#define _PS256_CONST_TYPE(Name, Type, Val) \ - static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { Val, Val, Val, Val, Val, Val, Val, Val } - -_PS256_CONST(1 , 1.0f); +#define _PS256_CONST(Name, Val) \ + static const ALIGN32_BEG float _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PI32_CONST256(Name, Val) \ + static const ALIGN32_BEG int _pi32_256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} +#define _PS256_CONST_TYPE(Name, Type, Val) \ + static const ALIGN32_BEG Type _ps256_##Name[8] ALIGN32_END = { \ + Val, Val, Val, Val, Val, Val, Val, Val} + +_PS256_CONST(1, 1.0f); _PS256_CONST(0p5, 0.5f); /* the smallest non denormalized float number */ _PS256_CONST_TYPE(min_norm_pos, int, 0x00800000); @@ -76,14 +79,14 @@ _PI32_CONST256(0x7f, 0x7f); _PS256_CONST(cephes_SQRTHF, 0.707106781186547524); _PS256_CONST(cephes_log_p0, 7.0376836292E-2); -_PS256_CONST(cephes_log_p1, - 1.1514610310E-1); +_PS256_CONST(cephes_log_p1, -1.1514610310E-1); _PS256_CONST(cephes_log_p2, 1.1676998740E-1); -_PS256_CONST(cephes_log_p3, - 1.2420140846E-1); -_PS256_CONST(cephes_log_p4, + 1.4249322787E-1); -_PS256_CONST(cephes_log_p5, - 1.6668057665E-1); -_PS256_CONST(cephes_log_p6, + 2.0000714765E-1); -_PS256_CONST(cephes_log_p7, - 2.4999993993E-1); -_PS256_CONST(cephes_log_p8, + 3.3333331174E-1); +_PS256_CONST(cephes_log_p3, -1.2420140846E-1); +_PS256_CONST(cephes_log_p4, +1.4249322787E-1); +_PS256_CONST(cephes_log_p5, -1.6668057665E-1); +_PS256_CONST(cephes_log_p6, +2.0000714765E-1); +_PS256_CONST(cephes_log_p7, -2.4999993993E-1); +_PS256_CONST(cephes_log_p8, +3.3333331174E-1); _PS256_CONST(cephes_log_q1, -2.12194440e-4); _PS256_CONST(cephes_log_q2, 0.693359375); @@ -94,50 +97,51 @@ typedef union imm_xmm_union { v4si xmm[2]; } imm_xmm_union; -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) { \ - imm_xmm_union u __attribute__((aligned(32))); \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ -} - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) { \ +#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ + { \ imm_xmm_union u __attribute__((aligned(32))); \ - u.xmm[0]=xmm0_; u.xmm[1]=xmm1_; imm_ = u.imm; \ + u.imm = imm_; \ + xmm0_ = u.xmm[0]; \ + xmm1_ = u.xmm[1]; \ } +#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ + { \ + imm_xmm_union u __attribute__((aligned(32))); \ + u.xmm[0] = xmm0_; \ + u.xmm[1] = xmm1_; \ + imm_ = u.imm; \ + } -#define AVX2_BITOP_USING_SSE2(fn) \ -static inline v8si avx2_mm256_##fn(v8si x, int a) \ -{ \ - /* use SSE2 instruction to perform the bitop AVX2 */ \ - v4si x1, x2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1,a); \ - x2 = _mm_##fn(x2,a); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return(ret); \ -} +#define AVX2_BITOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, int a) { \ + /* use SSE2 instruction to perform the bitop AVX2 */ \ + v4si x1, x2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + x1 = _mm_##fn(x1, a); \ + x2 = _mm_##fn(x2, a); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } //#warning "Using SSE2 to perform AVX2 bitshift ops" AVX2_BITOP_USING_SSE2(slli_epi32) AVX2_BITOP_USING_SSE2(srli_epi32) -#define AVX2_INTOP_USING_SSE2(fn) \ -static inline v8si avx2_mm256_##fn(v8si x, v8si y) \ 
-{ \ - /* use SSE2 instructions to perform the AVX2 integer operation */ \ - v4si x1, x2; \ - v4si y1, y2; \ - v8si ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1,y1); \ - x2 = _mm_##fn(x2,y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return(ret); \ -} +#define AVX2_INTOP_USING_SSE2(fn) \ + static inline v8si avx2_mm256_##fn(v8si x, v8si y) { \ + /* use SSE2 instructions to perform the AVX2 integer operation */ \ + v4si x1, x2; \ + v4si y1, y2; \ + v8si ret; \ + COPY_IMM_TO_XMM(x, x1, x2); \ + COPY_IMM_TO_XMM(y, y1, y2); \ + x1 = _mm_##fn(x1, y1); \ + x2 = _mm_##fn(x2, y2); \ + COPY_XMM_TO_IMM(x1, x2, ret); \ + return (ret); \ + } //#warning "Using SSE2 to perform AVX2 integer ops" AVX2_INTOP_USING_SSE2(and_si128) @@ -157,84 +161,83 @@ AVX2_INTOP_USING_SSE2(add_epi32) #define avx2_mm256_add_epi32 _mm256_add_epi32 #endif /* __AVX2__ */ - -/* natural logarithm computed for 8 simultaneous float +/* natural logarithm computed for 8 simultaneous float return NaN for x <= 0 */ v8sf log256_ps(v8sf x) { v8si imm0; - v8sf one = *(v8sf*)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; - //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); + // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps()); v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS); - x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos); /* cut off denormalized stuff */ + x = _mm256_max_ps( + x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */ // can be done with AVX2 imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23); /* keep only the fractional part */ - x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask); - x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask); + x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5); // this is again another AVX2 instruction - imm0 = avx2_mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f); v8sf e = _mm256_cvtepi32_ps(imm0); e = _mm256_add_ps(e, one); - /* part2: + /* part2: if( x < SQRTHF ) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; } */ - //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); - v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS); + // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF); + v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS); v8sf tmp = _mm256_and_ps(x, mask); x = _mm256_sub_ps(x, one); e = _mm256_sub_ps(e, _mm256_and_ps(one, mask)); x = _mm256_add_ps(x, tmp); - v8sf z = _mm256_mul_ps(x,x); + v8sf z = _mm256_mul_ps(x, x); - v8sf y = *(v8sf*)_ps256_cephes_log_p0; + v8sf y = *(v8sf *)_ps256_cephes_log_p0; y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6); y = _mm256_mul_ps(y, x); - y = 
_mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8); y = _mm256_mul_ps(y, x); y = _mm256_mul_ps(y, z); - - tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1); - y = _mm256_add_ps(y, tmp); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1); + y = _mm256_add_ps(y, tmp); - tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); + tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5); y = _mm256_sub_ps(y, tmp); - tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2); + tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2); x = _mm256_add_ps(x, y); x = _mm256_add_ps(x, tmp); - x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN + x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN return x; } -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); +_PS256_CONST(exp_hi, 88.3762626647949f); +_PS256_CONST(exp_lo, -88.3762626647949f); _PS256_CONST(cephes_LOG2EF, 1.44269504088896341); _PS256_CONST(cephes_exp_C1, 0.693359375); @@ -250,45 +253,45 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1); v8sf exp256_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; v8si imm0; - v8sf one = *(v8sf*)_ps256_1; + v8sf one = *(v8sf *)_ps256_1; - x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); - x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); + x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi); + x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ - fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); - fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); + fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF); + fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5); /* how to perform a floorf with SSE: just below */ - //imm0 = _mm256_cvttps_epi32(fx); - //tmp = _mm256_cvtepi32_ps(imm0); - + // imm0 = _mm256_cvttps_epi32(fx); + // tmp = _mm256_cvtepi32_ps(imm0); + tmp = _mm256_floor_ps(fx); /* if greater, substract 1 */ - //v8sf mask = _mm256_cmpgt_ps(tmp, fx); - v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); + // v8sf mask = _mm256_cmpgt_ps(tmp, fx); + v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); - tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); - v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); + tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1); + v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2); x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); - z = _mm256_mul_ps(x,x); - - v8sf y = *(v8sf*)_ps256_cephes_exp_p0; + z = _mm256_mul_ps(x, x); + + v8sf y = *(v8sf *)_ps256_cephes_exp_p0; y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4); y = _mm256_mul_ps(y, x); - y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); + y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); @@ -296,7 +299,7 @@ v8sf exp256_ps(v8sf x) { /* build 2^n */ imm0 = 
_mm256_cvttps_epi32(fx); // another two AVX2 instructions - imm0 = avx2_mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); + imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f); imm0 = avx2_mm256_slli_epi32(imm0, 23); v8sf pow2n = _mm256_castsi256_ps(imm0); y = _mm256_mul_ps(y, pow2n); @@ -307,13 +310,12 @@ _PS256_CONST(minus_cephes_DP1, -0.78515625); _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4); _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8); _PS256_CONST(sincof_p0, -1.9515295891E-4); -_PS256_CONST(sincof_p1, 8.3321608736E-3); +_PS256_CONST(sincof_p1, 8.3321608736E-3); _PS256_CONST(sincof_p2, -1.6666654611E-1); -_PS256_CONST(coscof_p0, 2.443315711809948E-005); +_PS256_CONST(coscof_p0, 2.443315711809948E-005); _PS256_CONST(coscof_p1, -1.388731625493765E-003); -_PS256_CONST(coscof_p2, 4.166664568298827E-002); -_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI - +_PS256_CONST(coscof_p2, 4.166664568298827E-002); +_PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI /* evaluation of 8 sines at onces using AVX intrisics @@ -327,7 +329,7 @@ _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI surprising but correct result. */ -v8sf sin256_ps(v8sf x) { // any x +v8sf sin256_ps(v8sf x) { // any x v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; v8si imm0, imm2; @@ -338,78 +340,78 @@ v8sf sin256_ps(v8sf x) { // any x sign_bit = x; /* take the absolute value */ - x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); + x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask); /* extract the sign bit (upper one) */ - sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); - + sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask); + /* scale by 4/Pi */ - y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); + y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI); - /* - Here we start a series of integer operations, which are in the - realm of AVX2. - If we don't have AVX, let's perform them using SSE2 directives - */ +/* + Here we start a series of integer operations, which are in the + realm of AVX2. 
+ If we don't have AVX, let's perform them using SSE2 directives +*/ #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction - imm2 = avx2_mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); - imm2 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1); + imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1); + imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1); y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ - imm0 = avx2_mm256_and_si256(imm2, *(v8si*)_pi32_256_4); + imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4); imm0 = avx2_mm256_slli_epi32(imm0, 29); - /* get the polynom selection mask + /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4 #include "hl_functions.h" namespace hppl { - extern __m256 exp(__m256 a); +extern __m256 exp(__m256 a); - __m256 relu(const __m256 a) { - __m256 tmp = _mm256_set1_ps(0.0f); - return _mm256_max_ps(a, tmp); - } +__m256 relu(const __m256 a) { + __m256 tmp = _mm256_set1_ps(0.0f); + return _mm256_max_ps(a, tmp); +} - __m256 sigmoid(const __m256 a) { - __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); - __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); - __m256 tmp = _mm256_max_ps(a, min); - tmp = _mm256_min_ps(tmp, max); - tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); - tmp = exp(tmp); - tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); - tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); - return tmp; - } +__m256 sigmoid(const __m256 a) { + __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); + __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); + __m256 tmp = _mm256_max_ps(a, min); + tmp = _mm256_min_ps(tmp, max); + tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp); + tmp = exp(tmp); + tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); + tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp); + return tmp; +} - __m256 tanh(const __m256 a) { - __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); - __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); - tmp = _mm256_min_ps(tmp, max); - tmp = exp(tmp); - return _mm256_sub_ps( - _mm256_div_ps(_mm256_set1_ps(2.0f), - _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), _mm256_set1_ps(1.0f)); - } +__m256 tanh(const __m256 a) { + __m256 max = _mm256_set1_ps(EXP_MAX_INPUT); + __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a); + tmp = _mm256_min_ps(tmp, max); + tmp = exp(tmp); + return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f), + _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)), + _mm256_set1_ps(1.0f)); +} - __m256 linear(const __m256 a) { - return a; - } +__m256 linear(const __m256 a) { return a; } - __m256 relu(const __m256 a, const __m256 b) { - return _mm256_mul_ps(a, +__m256 relu(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS), - _mm256_set1_ps(1.0f))); - } + _mm256_set1_ps(1.0f))); +} - __m256 sigmoid(const __m256 a, const __m256 b) { - return _mm256_mul_ps(_mm256_mul_ps(a, b), - _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); - } +__m256 sigmoid(const __m256 a, const __m256 b) { + return _mm256_mul_ps(_mm256_mul_ps(a, b), + _mm256_sub_ps(_mm256_set1_ps(1.0f), b)); +} - __m256 tanh(const __m256 a, const __m256 b) { - return _mm256_mul_ps(a, - _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); - } +__m256 tanh(const __m256 a, const __m256 b) { + return _mm256_mul_ps( + a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b))); +} - __m256 linear(const __m256 a, const __m256 b) { - 
return a; - } +__m256 linear(const __m256 a, const __m256 b) { return a; } } // namespace hppl diff --git a/paddle/cuda/src/hl_cpu_functions.cc b/paddle/cuda/src/hl_cpu_functions.cc index b8352c2d537fba5ec9cd3237fe8f3fa9c25cbffe..af00f352e536bf342e15315d1f6804225b87eb0b 100644 --- a/paddle/cuda/src/hl_cpu_functions.cc +++ b/paddle/cuda/src/hl_cpu_functions.cc @@ -12,46 +12,33 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "hl_functions.h" namespace hppl { - real relu(const real a) { - return a > 0.0f ? a : 0.0f; - } - - real sigmoid(const real a) { - const real min = SIGMOID_THRESHOLD_MIN; - const real max = SIGMOID_THRESHOLD_MAX; - real tmp = (a < min) ? min : ((a > max) ? max : a); - return 1.0 / (1.0 + exp(-tmp)); - } - - real tanh(const real a) { - real tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - return (2.0 / (1.0 + exp(tmp))) - 1.0; - } - - real linear(const real a) { - return a; - } - - real relu(const real a, const real b) { - return a * (b > 0.0f ? 1.0f : 0.0f); - } - - real sigmoid(const real a, const real b) { - return a * b * (1 - b); - } - - real tanh(const real a, const real b) { - return a * (1.0f - b * b); - } - - real linear(const real a, const real b) { - return a; - } +real relu(const real a) { return a > 0.0f ? a : 0.0f; } + +real sigmoid(const real a) { + const real min = SIGMOID_THRESHOLD_MIN; + const real max = SIGMOID_THRESHOLD_MAX; + real tmp = (a < min) ? min : ((a > max) ? max : a); + return 1.0 / (1.0 + exp(-tmp)); +} + +real tanh(const real a) { + real tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + return (2.0 / (1.0 + exp(tmp))) - 1.0; +} + +real linear(const real a) { return a; } + +real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); } + +real sigmoid(const real a, const real b) { return a * b * (1 - b); } + +real tanh(const real a, const real b) { return a * (1.0f - b * b); } + +real linear(const real a, const real b) { return a; } } // namespace hppl diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index abf6afadc218f615dc6b3cf734d09f072214be40..f82d6c9402fe392ba3d9e55cd551ff1b052fef65 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "hl_cuda.h" @@ -24,7 +23,7 @@ limitations under the License. */ namespace dynload { std::once_flag cublas_dso_flag; -void* cublas_dso_handle = nullptr; +void *cublas_dso_handle = nullptr; /** * The following macro definition can generate structs @@ -34,38 +33,32 @@ void* cublas_dso_handle = nullptr; * note: default dynamic linked libs */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template <typename... Args> \ - cublasStatus_t operator()(Args... args) { \ - typedef cublasStatus_t (*cublasFunc)(Args...); \ - std::call_once(cublas_dso_flag, GetCublasDsoHandle, \ - &cublas_dso_handle); \ - void* p_##__name = dlsym(cublas_dso_handle, #__name); \ - return reinterpret_cast<cublasFunc>(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template <typename... Args> \ + cublasStatus_t operator()(Args...
args) { \ + typedef cublasStatus_t (*cublasFunc)(Args...); \ + std::call_once(cublas_dso_flag, GetCublasDsoHandle, &cublas_dso_handle); \ + void *p_##__name = dlsym(cublas_dso_handle, #__name); \ + return reinterpret_cast<cublasFunc>(p_##__name)(args...); \ + } \ } __name; // struct DynLoad__##__name #else -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template <typename... Args> \ - cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + template <typename... Args> \ + cublasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ } __name; // struct DynLoad__##__name #endif -#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DYNAMIC_LOAD_CUBLAS_WRAP(__name) +#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) // include all needed cublas functions in HPPL -#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSgemv) \ - __macro(cublasDgemv) \ - __macro(cublasSgemm) \ - __macro(cublasDgemm) \ - __macro(cublasSgeam) \ - __macro(cublasDgeam) \ +#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasSgemv) __macro(cublasDgemv) __macro(cublasSgemm) \ + __macro(cublasDgemm) __macro(cublasSgeam) __macro(cublasDgeam) DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) @@ -88,41 +81,40 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) } /* namespace dynload */ - #ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM dynload::cublasSgeam -#define CUBLAS_GEMV dynload::cublasSgemv -#define CUBLAS_GEMM dynload::cublasSgemm -#define CUBLAS_GETRF dynload::cublasSgetrfBatched -#define CUBLAS_GETRI dynload::cublasSgetriBatched +#define CUBLAS_GEAM dynload::cublasSgeam +#define CUBLAS_GEMV dynload::cublasSgemv +#define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GETRF dynload::cublasSgetrfBatched +#define CUBLAS_GETRI dynload::cublasSgetriBatched #else -#define CUBLAS_GEAM dynload::cublasDgeam -#define CUBLAS_GEMV dynload::cublasDgemv -#define CUBLAS_GEMM dynload::cublasDgemm -#define CUBLAS_GETRF dynload::cublasDgetrfBatched -#define CUBLAS_GETRI dynload::cublasDgetriBatched +#define CUBLAS_GEAM dynload::cublasDgeam +#define CUBLAS_GEMV dynload::cublasDgemv +#define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GETRF dynload::cublasDgetrfBatched +#define CUBLAS_GETRI dynload::cublasDgetriBatched #endif -const char* hl_cublas_get_error_string(cublasStatus_t status) { +const char *hl_cublas_get_error_string(cublasStatus_t status) { switch (status) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "[cublas status]: not initialized"; - case CUBLAS_STATUS_ALLOC_FAILED: - return "[cublas status]: allocate failed"; - case CUBLAS_STATUS_INVALID_VALUE: - return "[cublas status]: invalid value"; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "[cublas status]: arch mismatch"; - case CUBLAS_STATUS_MAPPING_ERROR: - return "[cublas status]: mapping error"; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "[cublas status]: execution failed"; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "[cublas status]: internal error"; - case CUBLAS_STATUS_SUCCESS: - return "[cublas status]: success"; - default: - return "[cublas status]: unknown error"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "[cublas status]: not initialized"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "[cublas status]: allocate failed"; + case CUBLAS_STATUS_INVALID_VALUE: + return "[cublas status]: invalid value"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "[cublas status]: arch mismatch"; +
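Stripped of the cuBLAS specifics, the wrapper that DYNAMIC_LOAD_CUBLAS_WRAP generates is a lazily-resolved dlsym trampoline. A minimal sketch, assuming a hypothetical libexample.so and an int-returning function (Paddle's version differs only in the cublasStatus_t types and its GetCublasDsoHandle helper):

#include <dlfcn.h>
#include <mutex>

static std::once_flag g_dso_flag;
static void *g_dso_handle = nullptr;

#define DYNAMIC_LOAD_WRAP(__name)                                 \
  struct DynLoad__##__name {                                      \
    template <typename... Args>                                   \
    int operator()(Args... args) {                                \
      typedef int (*FuncT)(Args...);                              \
      std::call_once(g_dso_flag, [] { /* open the DSO once */     \
        g_dso_handle = dlopen("libexample.so", RTLD_LAZY);        \
      });                                                         \
      void *p_##__name = dlsym(g_dso_handle, #__name);            \
      return reinterpret_cast<FuncT>(p_##__name)(args...);        \
    }                                                             \
  } __name;  // a callable object named like the wrapped symbol

DYNAMIC_LOAD_WRAP(example_fn)  // afterwards, example_fn(1, 2) resolves lazily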
case CUBLAS_STATUS_MAPPING_ERROR: + return "[cublas status]: mapping error"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "[cublas status]: execution failed"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "[cublas status]: internal error"; + case CUBLAS_STATUS_SUCCESS: + return "[cublas status]: success"; + default: + return "[cublas status]: unknown error"; } } @@ -131,27 +123,21 @@ const char* hl_cublas_get_error_string(cublasStatus_t status) { * support << operator for more details error info. */ cublasStatus_t g_cublasStat; -#define CHECK_CUBLAS(cublas_func) \ - g_cublasStat = cublas_func; \ - CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \ - << "Cublas Error: " \ - << hl_cublas_get_error_string(g_cublasStat) \ - << " " +#define CHECK_CUBLAS(cublas_func) \ + g_cublasStat = cublas_func; \ + CHECK_EQ(CUBLAS_STATUS_SUCCESS, g_cublasStat) \ + << "Cublas Error: " << hl_cublas_get_error_string(g_cublasStat) << " " void hl_cublas_init(cublasHandle_t *cublas_handle, cudaStream_t stream) { CHECK_CUBLAS(dynload::cublasCreate(cublas_handle)) - << "[cublas init] Cublas create handle faild!"; + << "[cublas init] Cublas create handle failed!"; CHECK_CUBLAS(dynload::cublasSetStream(*cublas_handle, stream)) - << "[cublas init] Cublas set stream faild!"; + << "[cublas init] Cublas set stream failed!"; } -void hl_matrix_transpose(real *A_d, - real *C_d, - int dimM, - int dimN, - int lda, - int ldc) { +void hl_matrix_transpose( + real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc) { real alpha = 1.0; real beta = 0.0; @@ -159,11 +145,18 @@ void hl_matrix_transpose(real *A_d, CHECK_NOTNULL(C_d); CHECK_CUBLAS(CUBLAS_GEAM(t_resource.handle, - CUBLAS_OP_T, CUBLAS_OP_N, - dimM, dimN, - &alpha, A_d, lda, - &beta, nullptr, dimM, - C_d, ldc)); + CUBLAS_OP_T, + CUBLAS_OP_N, + dimM, + dimN, + &alpha, + A_d, + lda, + &beta, + nullptr, + dimM, + C_d, + ldc)); CHECK_SYNC("hl_matrix_transpose failed"); } @@ -188,13 +181,13 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { small-sized matrices. There may be a better way to reconstruct the API for better performance.
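   As a sketch, for a single N x N single-precision matrix the two batched
   calls below reduce to the following sequence (pointer-array arguments as
   in the surrounding code, error handling elided):

     cublasSgetrfBatched(handle, N, inout_d, lda, pivot_d, info_d, 1);
     (copy info_d back and check it: nonzero means a singular matrix)
     cublasSgetriBatched(handle, N, (const float **)inout_d, lda, pivot_d,
                         out_d, ldc, info_d, 1);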
*/ - CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle, - dimN, inout_d, lda, pivot_d, info_d, 1)); + CHECK_CUBLAS( + CUBLAS_GETRF(t_resource.handle, dimN, inout_d, lda, pivot_d, info_d, 1)); int info_h; hl_memcpy(&info_h, info_d, sizeof(int)); if (info_h != 0) { - LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; + LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; } /* Step 2: Compute the inverse of the matrix given its LU decomposition */ @@ -203,12 +196,18 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { hl_memcpy(out_d, out_h, sizeof(real *)); CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, - dimN, (const real **)inout_d, lda, pivot_d, - out_d, ldc, info_d, 1)); + dimN, + (const real **)inout_d, + lda, + pivot_d, + out_d, + ldc, + info_d, + 1)); hl_memcpy(&info_h, info_d, sizeof(int)); if (info_h != 0) { - LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n"; + LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n"; } hl_free_mem_device(inout_d); @@ -218,12 +217,19 @@ void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { CHECK_SYNC("hl_matrix_inverse failed"); } -void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta, - int lda, int ldb, int ldc) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta, + int lda, + int ldb, + int ldc) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -231,8 +237,8 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, if (dimN == 1 && dimM != 1 && dimK != 1 && transb == HPPL_OP_N) { int m = (transa == HPPL_OP_N) ? dimM : dimK; int n = (transa == HPPL_OP_N) ? dimK : dimM; - hl_matrix_mul_vector(A_d, transa, B_d, C_d, m, n, - alpha, beta, lda, ldb, ldc); + hl_matrix_mul_vector( + A_d, transa, B_d, C_d, m, n, alpha, beta, lda, ldb, ldc); return; } @@ -240,8 +246,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, int m = (transb == HPPL_OP_N) ? dimK : dimN; int n = (transb == HPPL_OP_N) ? dimN : dimK; hl_trans_op_t trans = (transb == HPPL_OP_N) ? 
HPPL_OP_T : HPPL_OP_N; - hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, - alpha, beta, ldb, 1, 1); + hl_matrix_mul_vector(B_d, trans, A_d, C_d, m, n, alpha, beta, ldb, 1, 1); return; } @@ -250,26 +255,47 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, stat = CUBLAS_GEMM(t_resource.handle, CUBLAS_OP_N, CUBLAS_OP_N, - dimN, dimM, dimK, - &alpha, B_d, ldb, - A_d, lda, - &beta, C_d, ldc); + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); } else if ((HPPL_OP_T == transa) && (HPPL_OP_N == transb)) { stat = CUBLAS_GEMM(t_resource.handle, CUBLAS_OP_N, CUBLAS_OP_T, - dimN, dimM, dimK, - &alpha, B_d, ldb, - A_d, lda, - &beta, C_d, ldc); + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); } else if ((HPPL_OP_N == transa) && (HPPL_OP_T == transb)) { stat = CUBLAS_GEMM(t_resource.handle, CUBLAS_OP_T, CUBLAS_OP_N, - dimN, dimM, dimK, - &alpha, B_d, ldb, - A_d, lda, - &beta, C_d, ldc); + dimN, + dimM, + dimK, + &alpha, + B_d, + ldb, + A_d, + lda, + &beta, + C_d, + ldc); } else { LOG(FATAL) << "parameter transa error!"; } @@ -277,24 +303,46 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa, CHECK_SYNC("hl_matrix_mul failed"); } -void hl_matrix_mul(real *A_d, hl_trans_op_t transa, - real *B_d, hl_trans_op_t transb, +void hl_matrix_mul(real *A_d, + hl_trans_op_t transa, + real *B_d, + hl_trans_op_t transb, real *C_d, - int dimM, int dimN, int dimK, - real alpha, real beta) { + int dimM, + int dimN, + int dimK, + real alpha, + real beta) { int lda = (HPPL_OP_N == transa) ? dimK : dimM; int ldb = (HPPL_OP_N == transb) ? dimN : dimK; int ldc = dimN; - hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN, - dimK, alpha, beta, lda, ldb, ldc); + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + alpha, + beta, + lda, + ldb, + ldc); } -void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta, - int lda, int incb, int incc) { +void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta, + int lda, + int incb, + int incc) { CHECK_NOTNULL(A_d); CHECK_NOTNULL(B_d); CHECK_NOTNULL(C_d); @@ -303,21 +351,29 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, if (HPPL_OP_N == trans) { stat = CUBLAS_GEMV(t_resource.handle, CUBLAS_OP_T, - dimN, dimM, + dimN, + dimM, &alpha, - A_d, lda, - B_d, incb, + A_d, + lda, + B_d, + incb, &beta, - C_d, incc); + C_d, + incc); } else if (HPPL_OP_T == trans) { stat = CUBLAS_GEMV(t_resource.handle, CUBLAS_OP_N, - dimN, dimM, + dimN, + dimM, &alpha, - A_d, lda, - B_d, incb, + A_d, + lda, + B_d, + incb, &beta, - C_d, incc); + C_d, + incc); } else { LOG(FATAL) << "parameter transa error!"; } @@ -326,10 +382,14 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, CHECK_SYNC("hl_matrix_mul_vector"); } -void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans, - real *B_d, real *C_d, - int dimM, int dimN, - real alpha, real beta) { - hl_matrix_mul_vector(A_d, trans, B_d, C_d, dimM, dimN, - alpha, beta, dimN, 1, 1); +void hl_matrix_mul_vector(real *A_d, + hl_trans_op_t trans, + real *B_d, + real *C_d, + int dimM, + int dimN, + real alpha, + real beta) { + hl_matrix_mul_vector( + A_d, trans, B_d, C_d, dimM, dimN, alpha, beta, dimN, 1, 1); } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 1829fe23ac594e63253df23b350b16cb28eaebc1..9d4ff08a78d641896e946e9bf04590d4ba93350f 100644 --- 
a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "hl_cuda_cudnn.h" @@ -22,9 +21,10 @@ limitations under the License. */ #include "paddle/utils/Logging.h" #include "paddle/utils/CommandLineParser.h" -P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096, - "Specify cuDNN max workspace limit, in units MB, " - "4096MB=4GB by default."); +P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, + 4096, + "Specify cuDNN max workspace limit, in units MB, " + "4096MB=4GB by default."); namespace dynload { @@ -41,16 +41,15 @@ void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template <typename... Args> \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudnn_func = decltype(__name(args...))(*)(Args...); \ - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \ - &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast<cudnn_func>(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template <typename... Args> \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast<cudnn_func>(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ #else @@ -69,6 +68,7 @@ void* cudnn_dso_handle = nullptr; * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ +// clang-format off #define CUDNN_DNN_ROUTINE_EACH(__macro) \ __macro(cudnnSetTensor4dDescriptor) \ __macro(cudnnSetTensor4dDescriptorEx) \ @@ -141,56 +141,53 @@ CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) #endif #undef CUDNN_DNN_ROUTINE_EACH - +// clang-format on } /* namespace dynload */ /** * Check build-in cudnn function using glog and it **does not** * support << operator for more details error info. */ -#define CHECK_CUDNN(cudnnFunc) \ - do { \ - cudnnStatus_t cudnnStat = cudnnFunc; \ - CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \ - << "Cudnn Error: " \ - << dynload::cudnnGetErrorString(cudnnStat); \ +#define CHECK_CUDNN(cudnnFunc) \ + do { \ + cudnnStatus_t cudnnStat = cudnnFunc; \ + CHECK_EQ(CUDNN_STATUS_SUCCESS, cudnnStat) \ + << "Cudnn Error: " << dynload::cudnnGetErrorString(cudnnStat); \ } while (0) bool g_is_libcudnn_init = false; int g_cudnn_lib_version = 0; -void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) { - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); +void hl_cudnn_desc_init(cudnnTensorDescriptor_t* cudnn_desc) { + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(cudnn_desc)); } -void hl_cudnn_init(cudnnHandle_t *cudnn_handle, cudaStream_t stream) { - size_t cudnn_dso_ver = dynload::cudnnGetVersion(); - size_t cudnn_dso_major = cudnn_dso_ver / 1000; - size_t cudnn_cuh_major = CUDNN_VERSION / 1000; - - // Compare cudnn header version with that of cudnn.so.
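To make the version arithmetic below concrete (the concrete numbers are illustrative):

    cudnnGetVersion() = 5005  =>  library major = 5005 / 1000 = 5
    CUDNN_VERSION     = 4007  =>  header major  = 4007 / 1000 = 4

so a v4 header paired with libcudnn v5 fails the check, while matched majors (or both sides below v4) pass; 4007 is the same value the CUDNN_VERSION >= 4007 guards later in this file test against.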
- CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) || - (cudnn_cuh_major == cudnn_dso_major)) - << "[cudnn init] libcudnn v" << cudnn_dso_major << - " with header v" << cudnn_cuh_major << " unmatched!\n" - << "PaddlePaddle Requirement: " - << "(header v[2-3] with libcudnn v[2-3]) Or " - << "(header v4 with libcudnn v4) Or " - << "(header v5 with libcudnn v5)."; - - CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050)) - << "cudnn v5 requires cuda version >= 7.5"; - - CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle)); - CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream)); - - g_is_libcudnn_init = true; - g_cudnn_lib_version = cudnn_dso_ver; +void hl_cudnn_init(cudnnHandle_t* cudnn_handle, cudaStream_t stream) { + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + size_t cudnn_dso_major = cudnn_dso_ver / 1000; + size_t cudnn_cuh_major = CUDNN_VERSION / 1000; + + // Compare cudnn header version with that of cudnn.so. + CHECK((cudnn_cuh_major < 4 && cudnn_dso_major < 4) || + (cudnn_cuh_major == cudnn_dso_major)) + << "[cudnn init] libcudnn v" << cudnn_dso_major << " with header v" + << cudnn_cuh_major << " unmatched!\n" + << "PaddlePaddle Requirement: " + << "(header v[2-3] with libcudnn v[2-3]) Or " + << "(header v4 with libcudnn v4) Or " + << "(header v5 with libcudnn v5)."; + + CHECK(!(CUDNN_VERSION >= 5000 && CUDA_VERSION < 7050)) + << "cudnn v5 requires cuda version >= 7.5"; + + CHECK_CUDNN(dynload::cudnnCreate(cudnn_handle)); + CHECK_CUDNN(dynload::cudnnSetStream(*cudnn_handle, stream)); + + g_is_libcudnn_init = true; + g_cudnn_lib_version = cudnn_dso_ver; } -int hl_get_cudnn_lib_version() { - return g_cudnn_lib_version; -} +int hl_get_cudnn_lib_version() { return g_cudnn_lib_version; } void hl_conv_workspace(hl_tensor_descriptor input, hl_tensor_descriptor output, @@ -204,99 +201,91 @@ void hl_conv_workspace(hl_tensor_descriptor input, size_t* bwdFilterLimitBytes) { #if CUDNN_VERSION >= 4000 - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - - // Specify workspace limit directly - size_t memoryLimitBytes = - (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; - - // cudnn convolution forward configuration - cudnnTensorDescriptor_t fwd_src_desc = - GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t fwd_dest_desc = - GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t fwd_filter_desc = - GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t fwd_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( - t_resource.cudnn_handle, - fwd_src_desc, - fwd_filter_desc, - fwd_conv_desc, - fwd_dest_desc, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convFwdAlgo))); - - CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( - t_resource.cudnn_handle, - fwd_src_desc, - fwd_filter_desc, - fwd_conv_desc, - fwd_dest_desc, - static_cast(*convFwdAlgo), - fwdLimitBytes)); - - // cudnn convolution backward data configuration - cudnnFilterDescriptor_t bwd_data_filter_desc = - GET_FILTER_DESCRIPTOR(filter); - cudnnTensorDescriptor_t bwd_data_diff_desc = - GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bwd_data_grad_desc = - GET_TENSOR_DESCRIPTOR(input); - cudnnConvolutionDescriptor_t bwd_data_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - 
CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdDataAlgo))); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( - t_resource.cudnn_handle, - bwd_data_filter_desc, - bwd_data_diff_desc, - bwd_data_conv_desc, - bwd_data_grad_desc, - static_cast(*convBwdDataAlgo), - bwdDataLimitBytes)); - - // cudnn convolution backward filter configuration - cudnnTensorDescriptor_t bwd_filter_src_desc = - GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t bwd_filter_diff_desc = - GET_TENSOR_DESCRIPTOR(output); - cudnnConvolutionDescriptor_t bwd_filter_conv_desc = - GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnFilterDescriptor_t bwd_filter_grad_desc = - GET_FILTER_DESCRIPTOR(filter); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( - t_resource.cudnn_handle, - bwd_filter_src_desc, - bwd_filter_diff_desc, - bwd_filter_conv_desc, - bwd_filter_grad_desc, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - memoryLimitBytes, - reinterpret_cast(convBwdFilterAlgo))); - - CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( - t_resource.cudnn_handle, bwd_filter_src_desc, - bwd_filter_diff_desc, bwd_filter_conv_desc, - bwd_filter_grad_desc, - static_cast(*convBwdFilterAlgo), - bwdFilterLimitBytes)); + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + + // Specify workspace limit directly + size_t memoryLimitBytes = + (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; + + // cudnn convolution forward configuration + cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm( + t_resource.cudnn_handle, + fwd_src_desc, + fwd_filter_desc, + fwd_conv_desc, + fwd_dest_desc, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convFwdAlgo))); + + CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize( + t_resource.cudnn_handle, + fwd_src_desc, + fwd_filter_desc, + fwd_conv_desc, + fwd_dest_desc, + static_cast(*convFwdAlgo), + fwdLimitBytes)); + + // cudnn convolution backward data configuration + cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnConvolutionDescriptor_t bwd_data_conv_desc = + GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm( + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdDataAlgo))); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + t_resource.cudnn_handle, + bwd_data_filter_desc, + bwd_data_diff_desc, + bwd_data_conv_desc, + bwd_data_grad_desc, + static_cast(*convBwdDataAlgo), + bwdDataLimitBytes)); + + // cudnn convolution backward filter configuration + cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnConvolutionDescriptor_t bwd_filter_conv_desc = + 
GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + memoryLimitBytes, + reinterpret_cast(convBwdFilterAlgo))); + + CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + t_resource.cudnn_handle, + bwd_filter_src_desc, + bwd_filter_diff_desc, + bwd_filter_conv_desc, + bwd_filter_grad_desc, + static_cast(*convBwdFilterAlgo), + bwdFilterLimitBytes)); #endif } @@ -306,55 +295,54 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc, int feature_maps, int height, int width) { - CHECK_NOTNULL(image_desc); + CHECK_NOTNULL(image_desc); - cudnn_tensor_descriptor hl_desc = - (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); - CHECK_NOTNULL(hl_desc); + cudnn_tensor_descriptor hl_desc = + (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); + CHECK_NOTNULL(hl_desc); #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); - - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor( - hl_desc->desc, - CUDNN_TENSOR_NCHW, - data_type, - batch_size, - feature_maps, - height, - width)); - - hl_desc->format = CUDNN_TENSOR_NCHW; - hl_desc->data_type = data_type; - hl_desc->batch_size = batch_size; - hl_desc->feature_maps = feature_maps; - hl_desc->height = height; - hl_desc->width = width; - - *image_desc = (hl_tensor_descriptor)hl_desc; + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); + + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(hl_desc->desc, + CUDNN_TENSOR_NCHW, + data_type, + batch_size, + feature_maps, + height, + width)); + + hl_desc->format = CUDNN_TENSOR_NCHW; + hl_desc->data_type = data_type; + hl_desc->batch_size = batch_size; + hl_desc->feature_maps = feature_maps; + hl_desc->height = height; + hl_desc->width = width; + + *image_desc = (hl_tensor_descriptor)hl_desc; } void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) { - CHECK_NOTNULL(image_desc); + CHECK_NOTNULL(image_desc); - cudnn_tensor_descriptor hl_desc = - (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); - CHECK_NOTNULL(hl_desc); + cudnn_tensor_descriptor hl_desc = + (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor)); + CHECK_NOTNULL(hl_desc); #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); + CHECK_CUDNN(dynload::cudnnCreateTensorDescriptor(&hl_desc->desc)); - hl_desc->data_type = data_type; + hl_desc->data_type = data_type; - *image_desc = (hl_tensor_descriptor)hl_desc; + *image_desc = (hl_tensor_descriptor)hl_desc; } void hl_tensor_reshape(hl_tensor_descriptor image_desc, @@ -362,19 +350,19 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, int feature_maps, int height, int width) { - const int stride_w = 1; - const int stride_h = width * stride_w; - const int stride_c = height * stride_h; - const int 
stride_n = feature_maps * stride_c; - return hl_tensor_reshape(image_desc, - batch_size, - feature_maps, - height, - width, - stride_n, - stride_c, - stride_h, - stride_w); + const int stride_w = 1; + const int stride_h = width * stride_w; + const int stride_c = height * stride_h; + const int stride_n = feature_maps * stride_c; + return hl_tensor_reshape(image_desc, + batch_size, + feature_maps, + height, + width, + stride_n, + stride_c, + stride_h, + stride_w); } void hl_tensor_reshape(hl_tensor_descriptor image_desc, @@ -386,42 +374,41 @@ void hl_tensor_reshape(hl_tensor_descriptor image_desc, int cStride, int hStride, int wStride) { - CHECK_NOTNULL(image_desc); - - cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; - CHECK_NOTNULL(hl_desc->desc); - - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc, - hl_desc->data_type, - batch_size, - feature_maps, - height, - width, - nStride, - cStride, - hStride, - wStride)); - - hl_desc->batch_size = batch_size; - hl_desc->feature_maps = feature_maps; - hl_desc->height = height; - hl_desc->width = width; + CHECK_NOTNULL(image_desc); + + cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; + CHECK_NOTNULL(hl_desc->desc); + + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptorEx(hl_desc->desc, + hl_desc->data_type, + batch_size, + feature_maps, + height, + width, + nStride, + cStride, + hStride, + wStride)); + + hl_desc->batch_size = batch_size; + hl_desc->feature_maps = feature_maps; + hl_desc->height = height; + hl_desc->width = width; } void hl_destroy_tensor_descriptor(hl_tensor_descriptor image_desc) { - CHECK_NOTNULL(image_desc); + CHECK_NOTNULL(image_desc); - cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; - CHECK_NOTNULL(hl_desc->desc); + cudnn_tensor_descriptor hl_desc = (cudnn_tensor_descriptor)image_desc; + CHECK_NOTNULL(hl_desc->desc); - CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc)); + CHECK_CUDNN(dynload::cudnnDestroyTensorDescriptor(hl_desc->desc)); - hl_desc->desc = NULL; + hl_desc->desc = NULL; - free(image_desc); + free(image_desc); } - void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, hl_pooling_mode_t mode, int height, @@ -430,63 +417,61 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, int width_padding, int stride_height, int stride_width) { - cudnnPoolingMode_t cudnn_mode; - switch (mode) { - case HL_POOLING_MAX: - cudnn_mode = CUDNN_POOLING_MAX; - break; - case HL_POOLING_AVERAGE: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - case HL_POOLING_AVERAGE_EXCLUDE_PADDING: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - break; - default: - LOG(FATAL) << "parameter mode error"; - } - - CHECK_NOTNULL(pooling_desc); - - cudnn_pooling_descriptor hl_pooling_desc = - (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor)); - CHECK_NOTNULL(hl_pooling_desc); - - CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc)); - - CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor( - hl_pooling_desc->desc, - cudnn_mode, + cudnnPoolingMode_t cudnn_mode; + switch (mode) { + case HL_POOLING_MAX: + cudnn_mode = CUDNN_POOLING_MAX; + break; + case HL_POOLING_AVERAGE: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + case HL_POOLING_AVERAGE_EXCLUDE_PADDING: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + default: + LOG(FATAL) << "parameter mode error"; + } + + CHECK_NOTNULL(pooling_desc); + + 
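The stride arithmetic in hl_tensor_reshape above is the usual dense NCHW layout. A standalone worked example (the shape constants are illustrative):

#include <cstdio>
int main() {
  const int C = 3, H = 4, W = 5;      // feature_maps, height, width
  const int stride_w = 1;
  const int stride_h = W * stride_w;  // 5
  const int stride_c = H * stride_h;  // 20
  const int stride_n = C * stride_c;  // 60
  // element (n, c, h, w) = (1, 2, 3, 4) sits at offset n*60 + c*20 + h*5 + w*1
  std::printf("%d\n", 1 * stride_n + 2 * stride_c + 3 * stride_h + 4 * stride_w);  // prints 119
  return 0;
}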
cudnn_pooling_descriptor hl_pooling_desc = + (cudnn_pooling_descriptor)malloc(sizeof(_cudnn_pooling_descriptor)); + CHECK_NOTNULL(hl_pooling_desc); + + CHECK_CUDNN(dynload::cudnnCreatePoolingDescriptor(&hl_pooling_desc->desc)); + + CHECK_CUDNN(dynload::cudnnSetPooling2dDescriptor(hl_pooling_desc->desc, + cudnn_mode, #if CUDNN_VERSION >= 5000 - CUDNN_PROPAGATE_NAN, + CUDNN_PROPAGATE_NAN, #endif - height, - width, - height_padding, - width_padding, - stride_height, - stride_width)); - - hl_pooling_desc->mode = cudnn_mode; - hl_pooling_desc->window_height = height; - hl_pooling_desc->window_width = width; - hl_pooling_desc->stride_height = stride_height; - hl_pooling_desc->stride_width = stride_width; - - *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; + height, + width, + height_padding, + width_padding, + stride_height, + stride_width)); + + hl_pooling_desc->mode = cudnn_mode; + hl_pooling_desc->window_height = height; + hl_pooling_desc->window_width = width; + hl_pooling_desc->stride_height = stride_height; + hl_pooling_desc->stride_width = stride_width; + + *pooling_desc = (hl_pooling_descriptor)hl_pooling_desc; } void hl_destroy_pooling_descriptor(hl_pooling_descriptor pooling_desc) { - CHECK_NOTNULL(pooling_desc); + CHECK_NOTNULL(pooling_desc); - cudnn_pooling_descriptor hl_pooling = - (cudnn_pooling_descriptor)pooling_desc; + cudnn_pooling_descriptor hl_pooling = (cudnn_pooling_descriptor)pooling_desc; - CHECK_NOTNULL(hl_pooling->desc); - CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); + CHECK_NOTNULL(hl_pooling->desc); + CHECK_CUDNN(dynload::cudnnDestroyPoolingDescriptor(hl_pooling->desc)); - hl_pooling->desc = NULL; + hl_pooling->desc = NULL; - free(pooling_desc); + free(pooling_desc); } void hl_pooling_forward(hl_tensor_descriptor input, @@ -494,31 +479,30 @@ void hl_pooling_forward(hl_tensor_descriptor input, hl_tensor_descriptor output, real* output_image, hl_pooling_descriptor pooling) { - cudnnPoolingDescriptor_t pooling_desc; - cudnnTensorDescriptor_t input_desc; - cudnnTensorDescriptor_t output_desc; - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(pooling); - CHECK_NOTNULL(input_image); - CHECK_NOTNULL(output_image); - - real alpha = 1.0f; - real beta = 1.0f; - input_desc = ((cudnn_tensor_descriptor)input)->desc; - output_desc = ((cudnn_tensor_descriptor)output)->desc; - pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; - CHECK_CUDNN(dynload::cudnnPoolingForward( - t_resource.cudnn_handle, - pooling_desc, - &alpha, - input_desc, - input_image, - &beta, - output_desc, - output_image)); - CHECK_SYNC("hl_pooling_forward failed"); + cudnnPoolingDescriptor_t pooling_desc; + cudnnTensorDescriptor_t input_desc; + cudnnTensorDescriptor_t output_desc; + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(pooling); + CHECK_NOTNULL(input_image); + CHECK_NOTNULL(output_image); + + real alpha = 1.0f; + real beta = 1.0f; + input_desc = ((cudnn_tensor_descriptor)input)->desc; + output_desc = ((cudnn_tensor_descriptor)output)->desc; + pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; + CHECK_CUDNN(dynload::cudnnPoolingForward(t_resource.cudnn_handle, + pooling_desc, + &alpha, + input_desc, + input_image, + &beta, + output_desc, + output_image)); + CHECK_SYNC("hl_pooling_forward failed"); } void hl_pooling_backward(hl_tensor_descriptor input, @@ -528,90 +512,86 @@ void hl_pooling_backward(hl_tensor_descriptor input, real* output_image, real* output_image_grad, hl_pooling_descriptor pooling) { - 
cudnnPoolingDescriptor_t pooling_desc; - cudnnTensorDescriptor_t input_desc; - cudnnTensorDescriptor_t output_desc; - - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(pooling); - CHECK_NOTNULL(input_image); - CHECK_NOTNULL(input_image_grad); - CHECK_NOTNULL(output_image); - CHECK_NOTNULL(output_image_grad); - - real alpha = 1.0f; - real beta = 1.0f; - input_desc = ((cudnn_tensor_descriptor)input)->desc; - output_desc = ((cudnn_tensor_descriptor)output)->desc; - pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; - CHECK_CUDNN(dynload::cudnnPoolingBackward( - t_resource.cudnn_handle, - pooling_desc, - &alpha, - output_desc, - output_image, - output_desc, - output_image_grad, - input_desc, - input_image, - &beta, - input_desc, - input_image_grad)); + cudnnPoolingDescriptor_t pooling_desc; + cudnnTensorDescriptor_t input_desc; + cudnnTensorDescriptor_t output_desc; + + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(pooling); + CHECK_NOTNULL(input_image); + CHECK_NOTNULL(input_image_grad); + CHECK_NOTNULL(output_image); + CHECK_NOTNULL(output_image_grad); + + real alpha = 1.0f; + real beta = 1.0f; + input_desc = ((cudnn_tensor_descriptor)input)->desc; + output_desc = ((cudnn_tensor_descriptor)output)->desc; + pooling_desc = ((cudnn_pooling_descriptor)pooling)->desc; + CHECK_CUDNN(dynload::cudnnPoolingBackward(t_resource.cudnn_handle, + pooling_desc, + &alpha, + output_desc, + output_image, + output_desc, + output_image_grad, + input_desc, + input_image, + &beta, + input_desc, + input_image_grad)); CHECK_SYNC("hl_pooling_backward failed"); } - void hl_create_filter_descriptor(hl_filter_descriptor* filter, int input_feature_maps, int output_feature_maps, int height, int width) { - CHECK_NOTNULL(filter); + CHECK_NOTNULL(filter); - cudnn_filter_descriptor hl_filter = - (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor)); - CHECK_NOTNULL(hl_filter); + cudnn_filter_descriptor hl_filter = + (cudnn_filter_descriptor)malloc(sizeof(_cudnn_filter_descriptor)); + CHECK_NOTNULL(hl_filter); - CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); + CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc)); #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor( - hl_filter->desc, - data_type, + CHECK_CUDNN(dynload::cudnnSetFilter4dDescriptor(hl_filter->desc, + data_type, #if CUDNN_VERSION >= 5000 - CUDNN_TENSOR_NCHW, + CUDNN_TENSOR_NCHW, #endif - output_feature_maps, - input_feature_maps, - height, - width)); - - hl_filter->data_type = data_type; - hl_filter->output_feature_maps = output_feature_maps; - hl_filter->input_feature_maps = input_feature_maps; - hl_filter->filter_height = height; - hl_filter->filter_width = width; - - *filter = (hl_filter_descriptor)hl_filter; + output_feature_maps, + input_feature_maps, + height, + width)); + + hl_filter->data_type = data_type; + hl_filter->output_feature_maps = output_feature_maps; + hl_filter->input_feature_maps = input_feature_maps; + hl_filter->filter_height = height; + hl_filter->filter_width = width; + + *filter = (hl_filter_descriptor)hl_filter; } - void hl_destroy_filter_descriptor(hl_filter_descriptor filter) { - CHECK_NOTNULL(filter); + CHECK_NOTNULL(filter); - cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; - 
CHECK_NOTNULL(hl_filter->desc); + cudnn_filter_descriptor hl_filter = (cudnn_filter_descriptor)filter; + CHECK_NOTNULL(hl_filter->desc); - CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc)); + CHECK_CUDNN(dynload::cudnnDestroyFilterDescriptor(hl_filter->desc)); - hl_filter->desc = NULL; + hl_filter->desc = NULL; - free(filter); + free(filter); } void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, @@ -621,36 +601,35 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv, int padding_width, int stride_height, int stride_width) { - CHECK_NOTNULL(conv); - - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor) - malloc(sizeof(_cudnn_convolution_descriptor)); - - CHECK_NOTNULL(hl_conv); - CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); - - cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor( - hl_conv->desc, - padding_height, - padding_width, - stride_height, - stride_width, - 1, - 1, - mode)); - - hl_conv->input_image = image; - hl_conv->filter = filter; - hl_conv->padding_height = padding_height; - hl_conv->padding_width = padding_width; - hl_conv->stride_height = stride_height; - hl_conv->stride_width = stride_width; - hl_conv->upscalex = 1; - hl_conv->upscaley = 1; - hl_conv->mode = mode; - - *conv = (hl_convolution_descriptor)hl_conv; + CHECK_NOTNULL(conv); + + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc( + sizeof(_cudnn_convolution_descriptor)); + + CHECK_NOTNULL(hl_conv); + CHECK_CUDNN(dynload::cudnnCreateConvolutionDescriptor(&hl_conv->desc)); + + cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc, + padding_height, + padding_width, + stride_height, + stride_width, + 1, + 1, + mode)); + + hl_conv->input_image = image; + hl_conv->filter = filter; + hl_conv->padding_height = padding_height; + hl_conv->padding_width = padding_width; + hl_conv->stride_height = stride_height; + hl_conv->stride_width = stride_width; + hl_conv->upscalex = 1; + hl_conv->upscaley = 1; + hl_conv->mode = mode; + + *conv = (hl_convolution_descriptor)hl_conv; } void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, @@ -660,44 +639,43 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv, int padding_width, int stride_height, int stride_width) { - CHECK_NOTNULL(conv); - CHECK_NOTNULL(image); - CHECK_NOTNULL(filter); - - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; - CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor( - conv_desc, - padding_height, - padding_width, - stride_height, - stride_width, - 1, - 1, - mode)); - - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; - hl_conv->input_image = image; - hl_conv->filter = filter; - hl_conv->padding_height = padding_height; - hl_conv->padding_width = padding_width; - hl_conv->stride_height = stride_height; - hl_conv->stride_width = stride_width; - hl_conv->upscalex = 1; - hl_conv->upscaley = 1; - hl_conv->mode = mode; + CHECK_NOTNULL(conv); + CHECK_NOTNULL(image); + CHECK_NOTNULL(filter); + + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnConvolutionMode_t mode = CUDNN_CROSS_CORRELATION; + CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(conv_desc, + padding_height, + padding_width, + stride_height, + stride_width, + 1, + 1, + mode)); + 
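For reference, the output geometry implied by a descriptor like the one just set (cross-correlation, with the upscale/dilation factors fixed at 1 above) is

    out = (in + 2 * padding - filter) / stride + 1

so, with illustrative numbers not taken from the patch, a 32-wide input under a 5-wide filter with padding 2 and stride 1 keeps its width: (32 + 4 - 5) / 1 + 1 = 32.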
+ cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; + hl_conv->input_image = image; + hl_conv->filter = filter; + hl_conv->padding_height = padding_height; + hl_conv->padding_width = padding_width; + hl_conv->stride_height = stride_height; + hl_conv->stride_width = stride_width; + hl_conv->upscalex = 1; + hl_conv->upscaley = 1; + hl_conv->mode = mode; } void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) { - CHECK_NOTNULL(conv); + CHECK_NOTNULL(conv); - cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; - CHECK_NOTNULL(hl_conv->desc); + cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)conv; + CHECK_NOTNULL(hl_conv->desc); - CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc)); - hl_conv->desc = NULL; + CHECK_CUDNN(dynload::cudnnDestroyConvolutionDescriptor(hl_conv->desc)); + hl_conv->desc = NULL; - free(conv); + free(conv); } void hl_convolution_forward(hl_tensor_descriptor input, @@ -710,33 +688,33 @@ void hl_convolution_forward(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convFwdAlgo) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - CHECK_NOTNULL(input_data); - CHECK_NOTNULL(output_data); - CHECK_NOTNULL(filter_data); - cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - real alpha = 1.0f; - real beta = 1.0f; - CHECK_CUDNN(dynload::cudnnConvolutionForward( - t_resource.cudnn_handle, - &alpha, - src_desc, - input_data, - filter_desc, - filter_data, - conv_desc, - static_cast(convFwdAlgo), - gpuWorkSpace, - sizeInBytes, - &beta, - dest_desc, - output_data)); + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + CHECK_NOTNULL(input_data); + CHECK_NOTNULL(output_data); + CHECK_NOTNULL(filter_data); + cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t dest_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + real alpha = 1.0f; + real beta = 1.0f; + CHECK_CUDNN(dynload::cudnnConvolutionForward( + t_resource.cudnn_handle, + &alpha, + src_desc, + input_data, + filter_desc, + filter_data, + conv_desc, + static_cast(convFwdAlgo), + gpuWorkSpace, + sizeInBytes, + &beta, + dest_desc, + output_data)); CHECK_SYNC("hl_convolution_forward failed"); } @@ -744,27 +722,26 @@ void hl_convolution_forward_add_bias(hl_tensor_descriptor bias, real* bias_data, hl_tensor_descriptor output, real* output_data) { - CHECK_NOTNULL(bias); - CHECK_NOTNULL(output); - CHECK_NOTNULL(bias_data); - CHECK_NOTNULL(output_data); - - cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); - real alpha = 1.0f; - real beta = 1.0f; - - CHECK_CUDNN(dynload::cudnnAddTensor( - t_resource.cudnn_handle, + CHECK_NOTNULL(bias); + CHECK_NOTNULL(output); + CHECK_NOTNULL(bias_data); + CHECK_NOTNULL(output_data); + + cudnnTensorDescriptor_t output_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); + real alpha = 1.0f; + real beta = 1.0f; + + 
CHECK_CUDNN(dynload::cudnnAddTensor(t_resource.cudnn_handle, #if CUDNN_VERSION < 4000 - CUDNN_ADD_SAME_C, + CUDNN_ADD_SAME_C, #endif - &alpha, - bias_desc, - bias_data, - &beta, - output_desc, - output_data)); + &alpha, + bias_desc, + bias_data, + &beta, + output_desc, + output_data)); CHECK_SYNC("hl_convolution_forward_add_bias failed"); } @@ -772,23 +749,22 @@ void hl_convolution_backward_bias(hl_tensor_descriptor bias, real* bias_grad_data, hl_tensor_descriptor output, real* output_grad_data) { - CHECK_NOTNULL(bias); - CHECK_NOTNULL(output); - CHECK_NOTNULL(bias_grad_data); - CHECK_NOTNULL(output_grad_data); - - real alpha = 1.0f; - real beta = 1.0f; - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); - CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias( - t_resource.cudnn_handle, - &alpha, - diff_desc, - output_grad_data, - &beta, - bias_desc, - bias_grad_data)); + CHECK_NOTNULL(bias); + CHECK_NOTNULL(output); + CHECK_NOTNULL(bias_grad_data); + CHECK_NOTNULL(output_grad_data); + + real alpha = 1.0f; + real beta = 1.0f; + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t bias_desc = GET_TENSOR_DESCRIPTOR(bias); + CHECK_CUDNN(dynload::cudnnConvolutionBackwardBias(t_resource.cudnn_handle, + &alpha, + diff_desc, + output_grad_data, + &beta, + bias_desc, + bias_grad_data)); CHECK_SYNC("hl_convolution_backward_bias failed"); } @@ -802,37 +778,37 @@ void hl_convolution_backward_filter(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convBwdFilterAlgo) { - CHECK_NOTNULL(input); - CHECK_NOTNULL(output); - CHECK_NOTNULL(filter); - CHECK_NOTNULL(conv); - CHECK_NOTNULL(input_data); - CHECK_NOTNULL(output_grad_data); - CHECK_NOTNULL(filter_grad_data); - - real alpha = 1.0f; - real beta = 1.0f; - cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter); - - CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter( - t_resource.cudnn_handle, - &alpha, - src_desc, - input_data, - diff_desc, - output_grad_data, - conv_desc, + CHECK_NOTNULL(input); + CHECK_NOTNULL(output); + CHECK_NOTNULL(filter); + CHECK_NOTNULL(conv); + CHECK_NOTNULL(input_data); + CHECK_NOTNULL(output_grad_data); + CHECK_NOTNULL(filter_grad_data); + + real alpha = 1.0f; + real beta = 1.0f; + cudnnTensorDescriptor_t src_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + cudnnFilterDescriptor_t grad_desc = GET_FILTER_DESCRIPTOR(filter); + + CHECK_CUDNN(dynload::cudnnConvolutionBackwardFilter( + t_resource.cudnn_handle, + &alpha, + src_desc, + input_data, + diff_desc, + output_grad_data, + conv_desc, #if CUDNN_VERSION >= 4000 - static_cast(convBwdFilterAlgo), - gpuWorkSpace, - sizeInBytes, + static_cast(convBwdFilterAlgo), + gpuWorkSpace, + sizeInBytes, #endif - &beta, - grad_desc, - filter_grad_data)); + &beta, + grad_desc, + filter_grad_data)); CHECK_SYNC("hl_convolution_backward_filter failed"); } @@ -846,119 +822,111 @@ void hl_convolution_backward_data(hl_tensor_descriptor input, void* gpuWorkSpace, size_t sizeInBytes, int convBwdDataAlgo) { - real alpha = 1.0f; - real beta = 1.0f; - cudnnFilterDescriptor_t filter_desc = 
GET_FILTER_DESCRIPTOR(filter); - cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); - cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input); - cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); - - CHECK_CUDNN(dynload::cudnnConvolutionBackwardData( - t_resource.cudnn_handle, - &alpha, - filter_desc, - filter_data, - diff_desc, - output_grad_data, - conv_desc, + real alpha = 1.0f; + real beta = 1.0f; + cudnnFilterDescriptor_t filter_desc = GET_FILTER_DESCRIPTOR(filter); + cudnnTensorDescriptor_t diff_desc = GET_TENSOR_DESCRIPTOR(output); + cudnnTensorDescriptor_t grad_desc = GET_TENSOR_DESCRIPTOR(input); + cudnnConvolutionDescriptor_t conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv); + + CHECK_CUDNN(dynload::cudnnConvolutionBackwardData( + t_resource.cudnn_handle, + &alpha, + filter_desc, + filter_data, + diff_desc, + output_grad_data, + conv_desc, #if CUDNN_VERSION >= 4000 - static_cast(convBwdDataAlgo), - gpuWorkSpace, - sizeInBytes, + static_cast(convBwdDataAlgo), + gpuWorkSpace, + sizeInBytes, #endif - &beta, - grad_desc, - input_data_grad)); + &beta, + grad_desc, + input_data_grad)); CHECK_SYNC("hl_convolution_backward_data failed"); } - -void hl_softmax_forward(real *input, - real *output, - int height, - int width) { +void hl_softmax_forward(real* input, real* output, int height, int width) { #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor( - t_resource.cudnn_desc, - CUDNN_TENSOR_NCHW, - data_type, - height, - width, - 1, - 1)); - - real alpha = 1.0f; - real beta = 0.0f; - CHECK_CUDNN(dynload::cudnnSoftmaxForward( - t_resource.cudnn_handle, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - t_resource.cudnn_desc, - input, - &beta, - t_resource.cudnn_desc, - output)); + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, + CUDNN_TENSOR_NCHW, + data_type, + height, + width, + 1, + 1)); + + real alpha = 1.0f; + real beta = 0.0f; + CHECK_CUDNN(dynload::cudnnSoftmaxForward(t_resource.cudnn_handle, + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + t_resource.cudnn_desc, + input, + &beta, + t_resource.cudnn_desc, + output)); CHECK_SYNC("hl_softmax_forward failed"); } -void hl_softmax_backward(real *output_value, - real *output_grad, +void hl_softmax_backward(real* output_value, + real* output_grad, int height, int width) { #ifndef PADDLE_TYPE_DOUBLE - cudnnDataType_t data_type = CUDNN_DATA_FLOAT; + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; #else - cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; + cudnnDataType_t data_type = CUDNN_DATA_DOUBLE; #endif - CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor( - t_resource.cudnn_desc, - CUDNN_TENSOR_NCHW, - data_type, - height, - width, - 1, - 1)); - - real alpha = 1.0f; - real beta = 0.0f; - CHECK_CUDNN(dynload::cudnnSoftmaxBackward( - t_resource.cudnn_handle, - CUDNN_SOFTMAX_ACCURATE, - CUDNN_SOFTMAX_MODE_CHANNEL, - &alpha, - t_resource.cudnn_desc, - output_value, - t_resource.cudnn_desc, - output_grad, - &beta, - t_resource.cudnn_desc, - output_grad)); + CHECK_CUDNN(dynload::cudnnSetTensor4dDescriptor(t_resource.cudnn_desc, + CUDNN_TENSOR_NCHW, + data_type, + height, + width, + 1, + 1)); + + real alpha = 1.0f; + real beta = 0.0f; + CHECK_CUDNN(dynload::cudnnSoftmaxBackward(t_resource.cudnn_handle, + 
CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_CHANNEL, + &alpha, + t_resource.cudnn_desc, + output_value, + t_resource.cudnn_desc, + output_grad, + &beta, + t_resource.cudnn_desc, + output_grad)); CHECK_SYNC("hl_softmax_backward failed"); } void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outputDesc, - real *output, + real* output, hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, + real* scale, + real* bias, double factor, - real *runningMean, - real *runningInvVar, + real* runningMean, + real* runningInvVar, double epsilon, - real *savedMean, - real *savedVar) { + real* savedMean, + real* savedVar) { #if CUDNN_VERSION >= 4007 if ((NULL != runningMean && NULL == runningInvVar) || (NULL == runningMean && NULL != runningInvVar)) { LOG(FATAL) << "runningMean and runningInvVar can be NULL " - << "but only at the same time."; + << "but only at the same time."; } if ((NULL != savedMean && NULL == savedVar) || (NULL == savedMean && NULL != savedVar)) { @@ -972,10 +940,24 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardTraining( - t_resource.cudnn_handle, mode, &alpha, &beta, xDesc, - input, yDesc, output, bnDesc, scale, bias, factor, - runningMean, runningInvVar, epsilon, savedMean, savedVar)); + CHECK_CUDNN( + dynload::cudnnBatchNormalizationForwardTraining(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + xDesc, + input, + yDesc, + output, + bnDesc, + scale, + bias, + factor, + runningMean, + runningInvVar, + epsilon, + savedMean, + savedVar)); CHECK_SYNC("hl_batch_norm_forward_training failed"); #else @@ -985,15 +967,15 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc, } void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, - real *input, - hl_tensor_descriptor outputDesc, - real *output, - hl_tensor_descriptor bnParamDesc, - real *scale, - real *bias, - real *estimatedMean, - real *estimatedInvVar, - double epsilon) { + real* input, + hl_tensor_descriptor outputDesc, + real* output, + hl_tensor_descriptor bnParamDesc, + real* scale, + real* bias, + real* estimatedMean, + real* estimatedInvVar, + double epsilon) { #if CUDNN_VERSION >= 4007 cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc); cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc); @@ -1001,10 +983,21 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, real alpha = 1.0f; real beta = 1.0f; cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL; - CHECK_CUDNN(dynload::cudnnBatchNormalizationForwardInference( - t_resource.cudnn_handle, mode, &alpha, &beta, xDesc, - input, yDesc, output, bnDesc, scale, bias, - estimatedMean, estimatedInvVar, epsilon)); + CHECK_CUDNN( + dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle, + mode, + &alpha, + &beta, + xDesc, + input, + yDesc, + output, + bnDesc, + scale, + bias, + estimatedMean, + estimatedInvVar, + epsilon)); CHECK_SYNC("hl_batch_norm_forward_inference failed"); #else @@ -1014,18 +1007,18 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc, } void hl_batch_norm_backward(hl_tensor_descriptor inputDesc, - real *input, + real* input, hl_tensor_descriptor outGradDesc, - real *outGrad, + real* outGrad, hl_tensor_descriptor inGradDesc, - real *inGrad, + real* inGrad, hl_tensor_descriptor dBnParamDesc, - real *scale, - real 
@@ -1014,18 +1007,18 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-                            real *input,
+                            real* input,
                             hl_tensor_descriptor outGradDesc,
-                            real *outGrad,
+                            real* outGrad,
                             hl_tensor_descriptor inGradDesc,
-                            real *inGrad,
+                            real* inGrad,
                             hl_tensor_descriptor dBnParamDesc,
-                            real *scale,
-                            real *scaleGrad,
-                            real *biasGrad,
+                            real* scale,
+                            real* scaleGrad,
+                            real* biasGrad,
                             double epsilon,
-                            real *savedMean,
-                            real *savedInvVar) {
+                            real* savedMean,
+                            real* savedInvVar) {
 #if CUDNN_VERSION >= 4007
   if ((NULL != savedMean && NULL == savedInvVar) ||
       (NULL == savedMean && NULL != savedInvVar)) {
@@ -1040,12 +1033,25 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
   real alpha = 1.0f;
   real beta = 1.0f;
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
-  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
-      t_resource.cudnn_handle, mode, &alpha, &beta,
-      &alpha, &beta,
-      xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
-      bnDesc, scale, scaleGrad, biasGrad, epsilon,
-      savedMean, savedInvVar));
+  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(t_resource.cudnn_handle,
+                                                       mode,
+                                                       &alpha,
+                                                       &beta,
+                                                       &alpha,
+                                                       &beta,
+                                                       xDesc,
+                                                       input,
+                                                       dyDesc,
+                                                       outGrad,
+                                                       dxDesc,
+                                                       inGrad,
+                                                       bnDesc,
+                                                       scale,
+                                                       scaleGrad,
+                                                       biasGrad,
+                                                       epsilon,
+                                                       savedMean,
+                                                       savedInvVar));
 
   CHECK_SYNC("hl_batch_norm_backward failed");
 #else
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index ca19f210c5c9d5151b01ce81a4f44663e2df97cc..85d4860b5bff6109663c46be01081558a58093ac 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include
 #include
 #include
@@ -27,7 +26,7 @@ limitations under the License. */
 namespace dynload {
 
 std::once_flag curand_dso_flag;
-void* curand_dso_handle = nullptr;
+void *curand_dso_handle = nullptr;
 
 /**
  * The following macro definition can generate structs
@@ -37,34 +36,31 @@ void* curand_dso_handle = nullptr;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                          \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    curandStatus_t operator()(Args... args) {                     \
-      typedef curandStatus_t (*curandFunc)(Args...);              \
-      std::call_once(curand_dso_flag, GetCurandDsoHandle,         \
-                     &curand_dso_handle);                         \
-      void* p_##__name = dlsym(curand_dso_handle, #__name);       \
-      return reinterpret_cast<curandFunc>(p_##__name)(args...);   \
-    }                                                             \
-  } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    curandStatus_t operator()(Args... args) {                                  \
+      typedef curandStatus_t (*curandFunc)(Args...);                           \
+      std::call_once(curand_dso_flag, GetCurandDsoHandle, &curand_dso_handle); \
+      void *p_##__name = dlsym(curand_dso_handle, #__name);                    \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);                \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 #else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                          \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    curandStatus_t operator()(Args... args) {                     \
-      return __name(args...);                                     \
-    }                                                             \
-  } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CURAND_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    curandStatus_t operator()(Args... args) {                                  \
+      return __name(args...);                                                  \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 #endif
 
 /* include all needed curand functions in HPPL */
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
-  __macro(curandCreateGenerator)             \
-  __macro(curandSetStream)                   \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
-  __macro(curandGenerateUniform)             \
-  __macro(curandGenerateUniformDouble)
+#define CURAND_RAND_ROUTINE_EACH(__macro)                 \
+  __macro(curandCreateGenerator) __macro(curandSetStream) \
+      __macro(curandSetPseudoRandomGeneratorSeed)         \
+          __macro(curandGenerateUniform) __macro(curandGenerateUniformDouble)
 
 CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 
@@ -72,7 +68,7 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef DYNAMIC_LOAD_CURAND_WRAP
 
 std::once_flag cudart_dso_flag;
-void* cudart_dso_handle = nullptr;
+void *cudart_dso_handle = nullptr;
 
 /**
  * The following macro definition can generate structs
@@ -82,109 +78,96 @@ void* cudart_dso_handle = nullptr;
  * note: default dynamic linked libs
 */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                          \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    auto operator()(Args... args) -> decltype(__name(args...)) {  \
-      using cudart_func = decltype(__name(args...))(*)(Args...);  \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle,         \
-                     &cudart_dso_handle);                         \
-      void* p_##__name = dlsym(cudart_dso_handle, #__name);       \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);  \
-    }                                                             \
-  } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
+      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
+      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 #else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                          \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    auto operator()(Args... args) -> decltype(__name(args...)) {  \
-      return __name(args...);                                     \
-    }                                                             \
-  } __name; /* struct DynLoad__##__name */
+#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      return __name(args...);                                                  \
+    }                                                                          \
+  } __name; /* struct DynLoad__##__name */
 #endif
 
 /* include all needed cuda functions in HPPL */
-#define CUDA_ROUTINE_EACH(__macro)       \
-  __macro(cudaMalloc)                    \
-  __macro(cudaHostAlloc)                 \
-  __macro(cudaFree)                      \
-  __macro(cudaFreeHost)                  \
-  __macro(cudaMemcpy)                    \
-  __macro(cudaMemset)                    \
-  __macro(cudaMemcpyAsync)               \
-  __macro(cudaSetDevice)                 \
-  __macro(cudaGetDevice)                 \
-  __macro(cudaGetDeviceCount)            \
-  __macro(cudaGetDeviceProperties)       \
-  __macro(cudaDeviceSynchronize)         \
-  __macro(cudaDeviceCanAccessPeer)       \
-  __macro(cudaDeviceEnablePeerAccess)    \
-  __macro(cudaStreamCreate)              \
-  __macro(cudaStreamDestroy)             \
-  __macro(cudaStreamSynchronize)         \
-  __macro(cudaStreamWaitEvent)           \
-  __macro(cudaEventCreate)               \
-  __macro(cudaEventRecord)               \
-  __macro(cudaEventQuery)                \
-  __macro(cudaEventDestroy)              \
-  __macro(cudaEventSynchronize)          \
-  __macro(cudaEventElapsedTime)          \
-  __macro(cudaSetDeviceFlags)            \
-  __macro(cudaGetLastError)              \
-  __macro(cudaFuncSetCacheConfig)        \
-  __macro(cudaRuntimeGetVersion)         \
-  __macro(cudaGetErrorString)
+#define CUDA_ROUTINE_EACH(__macro)                                            \
+  __macro(cudaMalloc) __macro(cudaHostAlloc) __macro(cudaFree)                \
+      __macro(cudaFreeHost) __macro(cudaMemcpy) __macro(cudaMemset) __macro(  \
+          cudaMemcpyAsync) __macro(cudaSetDevice) __macro(cudaGetDevice)      \
+          __macro(cudaGetDeviceCount) __macro(cudaGetDeviceProperties)        \
+              __macro(cudaDeviceSynchronize) __macro(cudaDeviceCanAccessPeer) \
+                  __macro(cudaDeviceEnablePeerAccess)                         \
+                      __macro(cudaStreamCreate) __macro(cudaStreamDestroy)    \
+                          __macro(cudaStreamSynchronize) __macro(             \
+                              cudaStreamWaitEvent) __macro(cudaEventCreate)   \
+                              __macro(cudaEventRecord) __macro(cudaEventQuery)\
+                                  __macro(cudaEventDestroy) __macro(          \
+                                      cudaEventSynchronize)                   \
+                                      __macro(cudaEventElapsedTime) __macro(  \
+                                          cudaSetDeviceFlags)                 \
+                                          __macro(cudaGetLastError) __macro(  \
+                                              cudaFuncSetCacheConfig)         \
+                                              __macro(cudaRuntimeGetVersion)  \
+                                                  __macro(cudaGetErrorString)
 
 CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
 
 #undef CUDA_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CUDART_WRAP
 
-} /* namespace dynload */
+}  /* namespace dynload */
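The DYNAMIC_LOAD_*_WRAP macros above generate, per API symbol, a functor that lazily opens the shared library once and resolves the symbol with dlsym() before forwarding the call. Hand-expanded for a single symbol, the generated wrapper looks roughly like this sketch (cudaSetDevice chosen arbitrarily; the struct name is illustrative, while the flag, handle, and GetCudartDsoHandle mirror the declarations above):

    #include <cuda_runtime.h>
    #include <dlfcn.h>
    #include <mutex>

    // Sketch of one macro expansion; not a literal copy of the generated code.
    struct DynLoad_cudaSetDevice_sketch {
      template <typename... Args>
      auto operator()(Args... args) -> decltype(cudaSetDevice(args...)) {
        using func_t = decltype(cudaSetDevice(args...)) (*)(Args...);
        // Load the runtime library exactly once; all wrappers share the
        // same once-flag and handle.
        std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle);
        // Resolve the real symbol and forward the call through it.
        void *sym = dlsym(cudart_dso_handle, "cudaSetDevice");
        return reinterpret_cast<func_t>(sym)(args...);
      }
    };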
 /**
  * @brief global resource.
  */
-int g_system_device_num = 0;                /* system device number */
-int device_num = 0;                         /* use device number */
-hl_device_prop *g_device;                   /* device info table */
-__thread thread_device_resources *t_device; /* device resources table */
+int g_system_device_num = 0;                /* system device number */
+int device_num = 0;                         /* use device number */
+hl_device_prop *g_device;                   /* device info table */
+__thread thread_device_resources *t_device; /* device resources table */
 int g_cuda_lib_version = 0;
 
 /* number of global stream */
-#define NUMBER_OF_GLOBAL_STREAM     (HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_GLOBAL_STREAM (HPPL_THREAD_STREAM_1)
 /* number of thread stream */
-#define NUMBER_OF_THREAD_STREAM     (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
+#define NUMBER_OF_THREAD_STREAM (HPPL_STREAM_END - HPPL_THREAD_STREAM_1)
 /* sizeof of device memory */
-#define HPPL_GPU_MEMORY_SIZE        (256*4)
+#define HPPL_GPU_MEMORY_SIZE (256 * 4)
 
 /**
  * Check built-in cuda functions using glog; it **does not**
 * support the << operator for more detailed error info.
 */
-#define CHECK_CUDA(cudaFunc)                                                   \
-  do {                                                                         \
-    cudaError_t cudaStat = cudaFunc;                                           \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                          \
-        << dynload::cudaGetErrorString(cudaStat);                              \
+#define CHECK_CUDA(cudaFunc)                                                   \
+  do {                                                                         \
+    cudaError_t cudaStat = cudaFunc;                                           \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                          \
+                                    << dynload::cudaGetErrorString(cudaStat);  \
   } while (0)
 
 /**
  * @brief thread resource.
 */
-__thread _hl_thread_resource t_resource = {
-    {0},    /* stream */
-    0,      /* handle */
-    0,      /* gen */
-    0,      /* cudnn_handle */
-    0,      /* cudnn_desc */
-    NULL,   /* gen_mutex */
-    NULL,   /* gpu_mem */
-    NULL,   /* cpu_mem */
-    0,      /* event */
-    -1,     /* device */
-    0,      /* major */
-    false}; /* is_init */
+__thread _hl_thread_resource t_resource = {{0},    /* stream */
+                                           0,      /* handle */
+                                           0,      /* gen */
+                                           0,      /* cudnn_handle */
+                                           0,      /* cudnn_desc */
+                                           NULL,   /* gen_mutex */
+                                           NULL,   /* gpu_mem */
+                                           NULL,   /* cpu_mem */
+                                           0,      /* event */
+                                           -1,     /* device */
+                                           0,      /* major */
+                                           false}; /* is_init */
 
 __thread cudaStream_t default_stream = 0;
 __thread bool g_sync_flag = true;
@@ -198,9 +181,9 @@ inline pid_t gettid() {
   uint64_t tid;
   pthread_threadid_np(NULL, &tid);
 #else
-  #ifndef __NR_gettid
-  #define __NR_gettid 224
-  #endif
+#ifndef __NR_gettid
+#define __NR_gettid 224
+#endif
   pid_t tid = syscall(__NR_gettid);
 #endif
   CHECK_NE((int)tid, -1);
@@ -208,8 +191,7 @@ inline pid_t gettid() {
 }
 
 void hl_init(int device) {
-  CHECK(hl_start_flag)
-      << "[Init failed] hl_start() did not succeed.";
+  CHECK(hl_start_flag) << "[Init failed] hl_start() did not succeed.";
 
   /* thread has been initialized */
   if (true == t_resource.is_init) {
@@ -220,16 +202,16 @@ void hl_init(int device) {
   /* create thread device resources */
   char *tmp;
   thread_device_resources device_res;
-  tmp = (char *)malloc(g_system_device_num*sizeof(thread_device_resources*) +
-                       device_num*sizeof(_thread_device_resources));
+  tmp = (char *)malloc(g_system_device_num * sizeof(thread_device_resources *) +
+                       device_num * sizeof(_thread_device_resources));
   CHECK_NOTNULL(tmp);
-  t_device = (thread_device_resources*)tmp;
-  device_res = (thread_device_resources)((char*)tmp +
-      g_system_device_num*sizeof(thread_device_resources*));
-  memset(t_device, 0, g_system_device_num*sizeof(thread_device_resources*));
+  t_device = (thread_device_resources *)tmp;
+  device_res = (thread_device_resources)(
+      (char *)tmp + g_system_device_num * sizeof(thread_device_resources *));
+  memset(t_device, 0, g_system_device_num * sizeof(thread_device_resources *));
 
-  char *tmp_stream = (char *)
-      malloc(device_num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+  char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_THREAD_STREAM *
                                    sizeof(cudaStream_t));
   CHECK_NOTNULL(tmp_stream);
 
   int num = 0;
@@ -239,8 +221,9 @@ void hl_init(int device) {
     }
 
     t_device[dev] = &device_res[num];
-    t_device[dev]->stream = (cudaStream_t*)(tmp_stream +
-        num*NUMBER_OF_THREAD_STREAM*sizeof(cudaStream_t));
+    t_device[dev]->stream =
+        (cudaStream_t *)(tmp_stream +
+                         num * NUMBER_OF_THREAD_STREAM * sizeof(cudaStream_t));
 
     hl_create_thread_resources(dev, t_device[dev]);
     num++;
@@ -266,14 +249,14 @@ void hl_fini() {
     t_resource.stream[i] = 0;
   }
 
-  char* tmp = (char*)t_device;
-  char* tmp_stream = NULL;
+  char *tmp = (char *)t_device;
+  char *tmp_stream = NULL;
   for (int dev = 0; dev < g_system_device_num; dev++) {
     if (!t_device[dev]) {
       continue;
     }
     if (!tmp_stream) {
-      tmp_stream = (char*)t_device[dev]->stream;
+      tmp_stream = (char *)t_device[dev]->stream;
     }
     for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j])); @@ -290,9 +273,7 @@ void hl_fini() { t_resource.is_init = false; } -int hl_get_device_count() { - return device_num; -} +int hl_get_device_count() { return device_num; } void hl_set_device(int device) { if (device == t_resource.device) { @@ -300,7 +281,7 @@ void hl_set_device(int device) { } CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device: " << device << " is not specified in startup."; + << "Device: " << device << " is not specified in startup."; CHECK_CUDA(dynload::cudaSetDevice(device)); @@ -312,11 +293,11 @@ void hl_set_device(int device) { if (true == t_resource.is_init) { for (int i = NUMBER_OF_GLOBAL_STREAM; i < HPPL_STREAM_END; i++) { t_resource.stream[i] = - t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM]; + t_device[device]->stream[i - NUMBER_OF_GLOBAL_STREAM]; } t_resource.gpu_mem = t_device[device]->gpu_mem; t_resource.cpu_mem = t_device[device]->cpu_mem; - t_resource.event = t_device[device]->mem_event; + t_resource.event = t_device[device]->mem_event; } t_resource.handle = g_device[device]->device_resources->handle; @@ -334,11 +315,11 @@ int hl_get_device() { return device; } -void* hl_malloc_device(size_t size) { +void *hl_malloc_device(size_t size) { void *dest_d; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaMalloc((void**)&dest_d, size)); + CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size)); return dest_d; } @@ -348,15 +329,15 @@ void hl_free_mem_device(void *dest_d) { cudaError_t err = dynload::cudaFree(dest_d); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) - << hl_get_device_error_string(); + << hl_get_device_error_string(); } -void* hl_malloc_host(size_t size) { +void *hl_malloc_host(size_t size) { void *dest_h; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaHostAlloc( - (void**)&dest_h, size, cudaHostAllocDefault)); + CHECK_CUDA( + dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); return dest_h; } @@ -366,7 +347,7 @@ void hl_free_mem_host(void *dest_h) { cudaError_t err = dynload::cudaFreeHost(dest_h); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) - << hl_get_device_error_string(); + << hl_get_device_error_string(); } void hl_memcpy(void *dst, void *src, size_t size) { @@ -388,8 +369,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { } CHECK_NOTNULL(src_h); CHECK_NOTNULL(dest_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, - cudaMemcpyHostToDevice)); + CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); } void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { @@ -398,8 +378,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { } CHECK_NOTNULL(dest_h); CHECK_NOTNULL(src_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, - cudaMemcpyDeviceToHost)); + CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); } void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { @@ -408,8 +387,8 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { } CHECK_NOTNULL(dest_d); CHECK_NOTNULL(src_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_d, size, - cudaMemcpyDeviceToDevice)); + CHECK_CUDA( + dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); } void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { @@ -423,8 +402,8 
@@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { CHECK_LT(stream, HPPL_STREAM_END); cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, - cu_stream)); + CHECK_CUDA( + dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); } void hl_start() { @@ -435,8 +414,8 @@ void hl_start() { bool hl_device_can_access_peer(int device, int peerDevice) { int canAccessPeer; - CHECK_CUDA(dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, - peerDevice)); + CHECK_CUDA( + dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); if (canAccessPeer == 1) { return true; @@ -478,33 +457,32 @@ void hl_create_global_resources(hl_device_prop device_prop) { /* create curand gen */ CHECK_EQ(dynload::curandCreateGenerator(&device_res->gen, - CURAND_RNG_PSEUDO_DEFAULT), CURAND_STATUS_SUCCESS) - << "[Start failed] Curand init failed."; + CURAND_RNG_PSEUDO_DEFAULT), + CURAND_STATUS_SUCCESS) + << "[Start failed] Curand init failed."; - CHECK_EQ(dynload::curandSetStream(device_res->gen, - device_res->stream[0]), CURAND_STATUS_SUCCESS) - << "[Start failed] Curand set stream failed!"; + CHECK_EQ(dynload::curandSetStream(device_res->gen, device_res->stream[0]), + CURAND_STATUS_SUCCESS) + << "[Start failed] Curand set stream failed!"; /* create cudnn handle */ hl_cudnn_init(&device_res->cudnn_handle, device_res->stream[0]); int seed = gettid(); - CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed( - device_res->gen, seed+device), CURAND_STATUS_SUCCESS); + CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(device_res->gen, + seed + device), + CURAND_STATUS_SUCCESS); - device_res->gen_mutex = - (pthread_mutex_t*)(malloc(sizeof (pthread_mutex_t))); + device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); pthread_mutex_init(device_res->gen_mutex, NULL); CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version)); } -int hl_get_cuda_version() { - return g_cuda_lib_version; -} +int hl_get_cuda_version() { return g_cuda_lib_version; } void hl_create_thread_resources(int device, - thread_device_resources device_res) { + thread_device_resources device_res) { CHECK_CUDA(dynload::cudaSetDevice(device)); /* create thread stream */ @@ -513,15 +491,15 @@ void hl_create_thread_resources(int device, } /* allocation device memory */ - device_res->gpu_mem = (real*)hl_malloc_device(HPPL_GPU_MEMORY_SIZE); + device_res->gpu_mem = (real *)hl_malloc_device(HPPL_GPU_MEMORY_SIZE); /* allocation host memory */ - device_res->cpu_mem = (real*)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); + device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event)); } -void hl_specify_devices_start(int* device, int number) { +void hl_specify_devices_start(int *device, int number) { if (hl_start_flag) return; /* 1. get the number of devices */ @@ -533,20 +511,19 @@ void hl_specify_devices_start(int* device, int number) { /* 2. check device & create device property table */ CHECK_LE(number, g_system_device_num) - << "[Start failed] System does not have enough device. " - << "Device number: " << g_system_device_num - << "Input number: " << number; + << "[Start failed] System does not have enough device. 
" + << "Device number: " << g_system_device_num << "Input number: " << number; char *tmp; hl_device_prop device_prop; - tmp = (char *)malloc(g_system_device_num*sizeof(hl_device_prop*) + - number*sizeof(_hl_device_prop)); + tmp = (char *)malloc(g_system_device_num * sizeof(hl_device_prop *) + + number * sizeof(_hl_device_prop)); CHECK(tmp) << "[Start failed] System memory is not enough."; - g_device = (hl_device_prop*)tmp; - device_prop = (hl_device_prop)((char*)tmp + - g_system_device_num*sizeof(hl_device_prop*)); - memset(g_device, 0, g_system_device_num*sizeof(hl_device_prop*)); + g_device = (hl_device_prop *)tmp; + device_prop = (hl_device_prop)( + (char *)tmp + g_system_device_num * sizeof(hl_device_prop *)); + memset(g_device, 0, g_system_device_num * sizeof(hl_device_prop *)); int num = 0; for (int i = 0; i < number; i++) { int dev; @@ -557,13 +534,13 @@ void hl_specify_devices_start(int* device, int number) { } CHECK_LT(dev, g_system_device_num) - << "[Start failed] The specified device number is " - << "out of range. Max device number: " << g_system_device_num - 1 - << " Specified devcie number: "<< dev; + << "[Start failed] The specified device number is " + << "out of range. Max device number: " << g_system_device_num - 1 + << " Specified devcie number: " << dev; if (g_device[dev]) { /* Warning */ - LOG(WARNING) <<"[Warning] Repeat specify device: " << dev; + LOG(WARNING) << "[Warning] Repeat specify device: " << dev; continue; } @@ -574,11 +551,11 @@ void hl_specify_devices_start(int* device, int number) { device_num = num; /* 3. create global device resources */ - char *tmp_res = (char *)malloc(device_num*sizeof(_global_device_resources)); + char *tmp_res = (char *)malloc(device_num * sizeof(_global_device_resources)); CHECK_NOTNULL(tmp_res); - char *tmp_stream = - (char *)malloc(device_num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t)); + char *tmp_stream = (char *)malloc(device_num * NUMBER_OF_GLOBAL_STREAM * + sizeof(cudaStream_t)); CHECK_NOTNULL(tmp_stream); num = 0; @@ -587,10 +564,11 @@ void hl_specify_devices_start(int* device, int number) { continue; } - g_device[i]->device_resources = (global_device_resources)(tmp_res + - num*sizeof(_global_device_resources)); - g_device[i]->device_resources->stream = (cudaStream_t*)(tmp_stream + - num*NUMBER_OF_GLOBAL_STREAM*sizeof(cudaStream_t)); + g_device[i]->device_resources = (global_device_resources)( + tmp_res + num * sizeof(_global_device_resources)); + g_device[i]->device_resources->stream = + (cudaStream_t *)(tmp_stream + + num * NUMBER_OF_GLOBAL_STREAM * sizeof(cudaStream_t)); hl_create_global_resources(g_device[i]); num++; @@ -600,9 +578,9 @@ void hl_specify_devices_start(int* device, int number) { hl_start_flag = true; /* set default device */ if (device == NULL) { - hl_set_device(0); + hl_set_device(0); } else { - hl_set_device(device[0]); + hl_set_device(device[0]); } } @@ -610,35 +588,31 @@ void hl_rand(real *dest_d, size_t num) { pthread_mutex_lock(t_resource.gen_mutex); CHECK_EQ( #ifndef PADDLE_TYPE_DOUBLE - dynload::curandGenerateUniform(t_resource.gen, dest_d, num), + dynload::curandGenerateUniform(t_resource.gen, dest_d, num), #else - dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), + dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num), #endif - CURAND_STATUS_SUCCESS); + CURAND_STATUS_SUCCESS); pthread_mutex_unlock(t_resource.gen_mutex); CHECK_SYNC("hl_rand failed"); } void hl_srand(unsigned int seed) { pthread_mutex_lock(t_resource.gen_mutex); - 
CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed( - t_resource.gen, seed), CURAND_STATUS_SUCCESS); + CHECK_EQ(dynload::curandSetPseudoRandomGeneratorSeed(t_resource.gen, seed), + CURAND_STATUS_SUCCESS); pthread_mutex_unlock(t_resource.gen_mutex); } -void hl_set_sync_flag(bool flag) { - g_sync_flag = flag; -} +void hl_set_sync_flag(bool flag) { g_sync_flag = flag; } -bool hl_get_sync_flag() { - return g_sync_flag; -} +bool hl_get_sync_flag() { return g_sync_flag; } void hl_stream_synchronize(hl_stream_t stream) { cudaStream_t cu_stream; - CHECK_LT(stream, HPPL_STREAM_END) - << __func__ <<": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream)); @@ -647,8 +621,8 @@ void hl_stream_synchronize(hl_stream_t stream) { void hl_create_event(hl_event_t *event) { CHECK_NOTNULL(event); - struct _hl_event_st* st_event = - (struct _hl_event_st*)malloc(sizeof(struct _hl_event_st)); + struct _hl_event_st *st_event = + (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event)); @@ -660,8 +634,8 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { CHECK_NOTNULL(start); CHECK_NOTNULL(end); - CHECK_CUDA(dynload::cudaEventElapsedTime(&time, - start->cu_event, end->cu_event)); + CHECK_CUDA( + dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); return time; } @@ -669,24 +643,22 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { cudaStream_t cu_stream; CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) - << __func__ <<": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaEventRecord( - event->cu_event, cu_stream)); + CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream)); } void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { cudaStream_t cu_stream; CHECK_NOTNULL(event); - CHECK_LT(stream, HPPL_STREAM_END) - << __func__ <<": the parameter stream is error."; + CHECK_LT(stream, HPPL_STREAM_END) << __func__ + << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaStreamWaitEvent( - cu_stream, event->cu_event, 0)); + CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); } void hl_destroy_event(hl_event_t event) { @@ -705,15 +677,15 @@ void hl_event_synchronize(hl_event_t event) { void hl_get_device_name(char *name, int len, int device) { CHECK_NOTNULL(name); CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device("<< device <<") is not specified in startup."; + << "Device(" << device << ") is not specified in startup."; - strncpy(name, g_device[device]->device_name , len); + strncpy(name, g_device[device]->device_name, len); } void hl_get_device_memory(size_t *mem_size, int device) { CHECK_NOTNULL(mem_size); CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device("<< device <<") is not specified in startup."; + << "Device(" << device << ") is not specified in startup."; *mem_size = g_device[device]->device_mem; } @@ -722,31 +694,26 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) { CHECK_NOTNULL(major); CHECK_NOTNULL(minor); CHECK(device >= 0 && device < g_system_device_num && g_device[device]) - << "Device("<< device << ") is not 
specified in startup."; + << "Device(" << device << ") is not specified in startup."; *major = g_device[device]->major; *minor = g_device[device]->minor; } -int hl_get_device_last_error() { - return (int)dynload::cudaGetLastError(); -} +int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); } -const char* hl_get_device_error_string() { +const char *hl_get_device_error_string() { cudaError_t err = dynload::cudaGetLastError(); return dynload::cudaGetErrorString(err); } -const char* hl_get_device_error_string(size_t err) { +const char *hl_get_device_error_string(size_t err) { return dynload::cudaGetErrorString((cudaError_t)err); } -void hl_device_synchronize() { - CHECK_CUDA(dynload::cudaDeviceSynchronize()); -} +void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); } void hl_set_device_flags_block() { - CHECK_CUDA(dynload::cudaSetDeviceFlags( - cudaDeviceScheduleBlockingSync)); + CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); } bool hl_cuda_event_is_ready(hl_event_t event) { diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index fe755b8c2606dffeeff2ea1549180ca8b134c251..610b47581c90b89713f21d8eca6d86ac85ff647c 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifdef PADDLE_USE_DSO #include @@ -29,48 +28,46 @@ limitations under the License. */ namespace dynload { extern std::once_flag cudart_dso_flag; -extern void* cudart_dso_handle; +extern void *cudart_dso_handle; /** * The following macro definition can generate structs * (for each function) to dynamic load cuda routine * via operator overloading. **/ -#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... args) { \ - typedef __type (*cudartFunc)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, \ - &cudart_dso_handle); \ - void* p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ +#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \ + struct DynLoad__##__name { \ + template \ + __type operator()(Args... 
args) { \ + typedef __type (*cudartFunc)(Args...); \ + std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ + void *p_##__name = dlsym(cudart_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; /* struct DynLoad__##__name */ /* include all needed cuda functions in HPPL */ -#define CUDA_ROUTINE_EACH(__macro) \ - __macro(cudaLaunch, cudaError_t) \ - __macro(cudaSetupArgument, cudaError_t) \ - __macro(cudaConfigureCall, cudaError_t) \ - __macro(__cudaRegisterFatBinary, void**) \ - __macro(__cudaUnregisterFatBinary, void) \ - __macro(__cudaRegisterFunction, void) \ - __macro(__cudaRegisterVar, void) \ - __macro(__cudaRegisterManagedVar, void) \ - __macro(__cudaInitModule, char) \ - __macro(__cudaRegisterTexture, void) \ - __macro(__cudaRegisterSurface, void) +#define CUDA_ROUTINE_EACH(__macro) \ + __macro(cudaLaunch, cudaError_t) __macro(cudaSetupArgument, cudaError_t) \ + __macro(cudaConfigureCall, cudaError_t) \ + __macro(__cudaRegisterFatBinary, void **) \ + __macro(__cudaUnregisterFatBinary, void) \ + __macro(__cudaRegisterFunction, void) \ + __macro(__cudaRegisterVar, void) \ + __macro(__cudaRegisterManagedVar, void) \ + __macro(__cudaInitModule, char) \ + __macro(__cudaRegisterTexture, void) \ + __macro(__cudaRegisterSurface, void) CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) #if CUDART_VERSION >= 7000 - DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t) +DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t) #endif #undef CUDA_ROUNTINE_EACH -} /* namespace dynload */ +} /* namespace dynload */ #if CUDART_VERSION >= 7000 __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, @@ -79,12 +76,11 @@ __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, void **args, size_t sharedMem, cudaStream_t stream) { - return dynload::cudaLaunchKernel(func, gridDim, blockDim, - args, sharedMem, stream); + return dynload::cudaLaunchKernel( + func, gridDim, blockDim, args, sharedMem, stream); } #endif /* CUDART_VERSION >= 7000 */ - __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) { return dynload::cudaLaunch(func); } @@ -99,13 +95,12 @@ __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem, cudaStream_t stream) { - return dynload::cudaConfigureCall(gridDim, blockDim, - sharedMem, stream); + return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream); } extern "C" { -void** CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { +void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { return dynload::__cudaRegisterFatBinary(fatCubin); } @@ -113,86 +108,87 @@ void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) { return dynload::__cudaUnregisterFatBinary(fatCubinHandle); } -void CUDARTAPI __cudaRegisterFunction( - void **fatCubinHandle, - const char *hostFun, - char *deviceFun, - const char *deviceName, - int thread_limit, - uint3 *tid, - uint3 *bid, - dim3 *bDim, - dim3 *gDim, - int *wSize -) { - return dynload::__cudaRegisterFunction( - fatCubinHandle, hostFun, deviceFun, deviceName, - thread_limit, tid, bid, bDim, gDim, wSize); +void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, + const char *hostFun, + char *deviceFun, + const char *deviceName, + int thread_limit, + uint3 *tid, + uint3 *bid, + dim3 *bDim, + dim3 *gDim, + int *wSize) { + return dynload::__cudaRegisterFunction(fatCubinHandle, + hostFun, + deviceFun, + deviceName, + thread_limit, + tid, + bid, + bDim, + gDim, + wSize); } -void CUDARTAPI __cudaRegisterVar( - void 
**fatCubinHandle, - char *hostVar, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global -) { - return dynload::__cudaRegisterVar( - fatCubinHandle, hostVar, deviceAddress, - deviceName, ext, size, constant, global); +void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle, + char *hostVar, + char *deviceAddress, + const char *deviceName, + int ext, + int size, + int constant, + int global) { + return dynload::__cudaRegisterVar(fatCubinHandle, + hostVar, + deviceAddress, + deviceName, + ext, + size, + constant, + global); } - - -extern void CUDARTAPI __cudaRegisterManagedVar( - void **fatCubinHandle, - void **hostVarPtrAddress, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global -) { - return dynload::__cudaRegisterManagedVar( - fatCubinHandle, hostVarPtrAddress, deviceAddress, - deviceName, ext, size, constant, global); +extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle, + void **hostVarPtrAddress, + char *deviceAddress, + const char *deviceName, + int ext, + int size, + int constant, + int global) { + return dynload::__cudaRegisterManagedVar(fatCubinHandle, + hostVarPtrAddress, + deviceAddress, + deviceName, + ext, + size, + constant, + global); } -char CUDARTAPI __cudaInitModule( - void **fatCubinHandle -) { +char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { return dynload::__cudaInitModule(fatCubinHandle); } -void CUDARTAPI __cudaRegisterTexture( - void **fatCubinHandle, - const struct textureReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int norm, - int ext -) { +void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle, + const struct textureReference *hostVar, + const void **deviceAddress, + const char *deviceName, + int dim, + int norm, + int ext) { return dynload::__cudaRegisterTexture( - fatCubinHandle, hostVar, deviceAddress, - deviceName, dim, norm, ext); + fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext); } -void CUDARTAPI __cudaRegisterSurface( - void **fatCubinHandle, - const struct surfaceReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int ext -) { +void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle, + const struct surfaceReference *hostVar, + const void **deviceAddress, + const char *deviceName, + int dim, + int ext) { return dynload::__cudaRegisterSurface( - fatCubinHandle, hostVar, deviceAddress, - deviceName, dim, ext); + fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext); } } /* extern "C" */ diff --git a/paddle/cuda/src/hl_math.cc b/paddle/cuda/src/hl_math.cc index 76d48c4a9b94d402cf84c57bd240e03a1a83b1a0..f4bf888bab4e92dd940714ef1b7aeee9242eb817 100644 --- a/paddle/cuda/src/hl_math.cc +++ b/paddle/cuda/src/hl_math.cc @@ -12,24 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/
-
 #include "avx_mathfun.h"
 
 namespace hppl {
 
-__m256 exp(__m256 a) {
-  return exp256_ps(a);
-}
+__m256 exp(__m256 a) { return exp256_ps(a); }
 
-__m256 log(__m256 a) {
-  return log256_ps(a);
-}
+__m256 log(__m256 a) { return log256_ps(a); }
 
-__m256 sin(__m256 a) {
-  return sin256_ps(a);
-}
+__m256 sin(__m256 a) { return sin256_ps(a); }
 
-__m256 cos(__m256 a) {
-  return cos256_ps(a);
-}
+__m256 cos(__m256 a) { return cos256_ps(a); }
 
 }  // namespace hppl
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index adc88d60dd8d547cedcae5fd088b2fa581d8e5be..d52b2a1df07374f632def12eb52e10e10ca86028 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include
 #include
 #include
@@ -21,8 +20,7 @@ limitations under the License. */
 using std::chrono::high_resolution_clock;
 
 int64_t getCurrentTimeStick() {
-    high_resolution_clock::time_point tp = high_resolution_clock::now();
-    high_resolution_clock::duration dtn = tp.time_since_epoch();
-    return dtn.count();
+  high_resolution_clock::time_point tp = high_resolution_clock::now();
+  high_resolution_clock::duration dtn = tp.time_since_epoch();
+  return dtn.count();
 }
-
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 27eed75d4d76c351e381a3b71dc44a3254fb1a4d..f1bb94216c44b3e915f87a3ae49bdfd3ef812916 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -51,12 +51,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
  * @brief Macro for registering a derived activation class
 */
 #define END_DEFINE_ACTIVATION(ACTIVATION_NAME)                     \
-  };                                                               \
+  }                                                                \
+  ;                                                                \
   const std::string ACTIVATION_CLASS_NAME(ACTIVATION_NAME)::name = \
       #ACTIVATION_NAME;                                            \
   static InitFunction __reg_activation__##ACTIVATION_NAME([] {     \
-    gActivationRegistrar.registerClass<                            \
-        ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(#ACTIVATION_NAME); \
+    gActivationRegistrar                                           \
+        .registerClass<ACTIVATION_CLASS_NAME(ACTIVATION_NAME)>(    \
+            #ACTIVATION_NAME);                                     \
   });
 
 /**
@@ -111,14 +113,22 @@ void backward(Argument& act) {
     outputG->softmaxBackward(*outputV);
   } else {
     SetDevice device(act.deviceId);
-    Matrix::resizeOrCreate(sftMaxDot_, outputG->getHeight(),
+    Matrix::resizeOrCreate(sftMaxDot_,
+                           outputG->getHeight(),
                            outputG->getWidth(),
-                           /* trans */ false, useGpu(act.deviceId));
-    Matrix::resizeOrCreate(sftMaxSum_, outputG->getHeight(), 1,
-                           /* trans */ false, useGpu(act.deviceId));
+                           /* trans */ false,
+                           useGpu(act.deviceId));
+    Matrix::resizeOrCreate(sftMaxSum_,
+                           outputG->getHeight(),
+                           1,
+                           /* trans */ false,
+                           useGpu(act.deviceId));
     if (!one_ || one_->getWidth() != outputG->getWidth()) {
-      Matrix::resizeOrCreate(one_, 1, outputG->getWidth(),
-                             /* trans */ false, useGpu(act.deviceId));
+      Matrix::resizeOrCreate(one_,
+                             1,
+                             outputG->getWidth(),
+                             /* trans */ false,
+                             useGpu(act.deviceId));
       one_->one();
     }
 
@@ -130,7 +140,6 @@ void backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(softmax)
 
-
 /**
  * @brief Sequence_softmax Activation
  * @note Softmax on all frames of one sequence.
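For orientation: the BEGIN/END_DEFINE_ACTIVATION pair expands to a concrete activation class plus a static InitFunction whose constructor registers that class with gActivationRegistrar before main() runs, which is what lets ActivationFunction::create(type) build activations by name. A reduced, self-contained sketch of the same static-registration idea (all names here are illustrative, not Paddle's):

    #include <functional>
    #include <map>
    #include <string>

    struct ActBase {
      virtual ~ActBase() {}
    };
    struct SoftmaxLikeAct : ActBase {};

    // Name -> factory table, filled in by static registrars before main().
    static std::map<std::string, std::function<ActBase*()>>& actRegistry() {
      static std::map<std::string, std::function<ActBase*()>> r;
      return r;
    }

    // Stand-in for InitFunction: a static object whose constructor records
    // a factory under the activation's name.
    static struct RegisterSoftmaxLike {
      RegisterSoftmaxLike() {
        actRegistry()["softmax"] = [] { return new SoftmaxLikeAct(); };
      }
    } regSoftmaxLike;

    static ActBase* createActivation(const std::string& type) {
      auto it = actRegistry().find(type);
      return it == actRegistry().end() ? nullptr : it->second();
    }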
@@ -146,10 +155,16 @@ void forward(Argument& act) { CHECK_EQ(act.value->getWidth(), 1UL); if (!argument_.value) { - argument_.value = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, useGpu(act.deviceId)); - argument_.grad = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, useGpu(act.deviceId)); + argument_.value = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + useGpu(act.deviceId)); + argument_.grad = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + useGpu(act.deviceId)); } auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId)); @@ -267,8 +282,11 @@ END_DEFINE_ACTIVATION(softrelu) BEGIN_DEFINE_ACTIVATION(abs) void forward(Argument& act) { SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(), - /* trans */ false, useGpu(act.deviceId)); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); act.in->copyFrom(*act.value); act.value->abs(*act.value); @@ -286,8 +304,11 @@ END_DEFINE_ACTIVATION(abs) BEGIN_DEFINE_ACTIVATION(square) void forward(Argument& act) { SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(), - /* trans */ false, useGpu(act.deviceId)); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); act.in->copyFrom(*act.value); act.value->square(*act.value); @@ -317,8 +338,11 @@ END_DEFINE_ACTIVATION(exponential) BEGIN_DEFINE_ACTIVATION(log) void forward(Argument& act) { SetDevice device(act.deviceId); - Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(), - /* trans */ false, useGpu(act.deviceId)); + Matrix::resizeOrCreate(act.in, + act.value->getHeight(), + act.value->getWidth(), + /* trans */ false, + useGpu(act.deviceId)); act.in->copyFrom(*act.value); act.value->log(*act.value); @@ -333,11 +357,9 @@ ActivationFunction* ActivationFunction::create(const std::string& type) { std::vector ActivationFunction::getAllRegisteredTypes() { std::vector types; - gActivationRegistrar.forEachType([&](const std::string& type) { - types.push_back(type); - }); + gActivationRegistrar.forEachType( + [&](const std::string& type) { types.push_back(type); }); return types; } - } // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h index c483372256c035e39bfdbcaa4193a1a2e7fd80b8..e9ed5c619ab5e4dd9c52c0dac24478c2a57aa1bf 100644 --- a/paddle/gserver/activations/ActivationFunction.h +++ b/paddle/gserver/activations/ActivationFunction.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include #include diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index 2cfb5a3a18c8a63d69bf0598eeee2807376340bc..e6cc4a246a8494d287f8638674f4ae213f38f657 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/
-
 #include "DataProvider.h"
 
 #include "paddle/utils/Util.h"
@@ -57,7 +56,7 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
   }
 }
 
-DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+DoubleBuffer::DoubleBuffer(DataProvider* dataPool,
                            bool useGpu,
                            int64_t batchSize) {
   batchSize_ = batchSize;
@@ -155,7 +154,7 @@ void DoubleBuffer::startAsyncLoad() {
 }
 
 ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
-DataProvider::registrar_;
+    DataProvider::registrar_;
 
 DataProvider* DataProvider::create(const DataConfig& config,
                                    const ModelConfig& modelConfig,
@@ -182,7 +181,8 @@ int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   for (int i = 0; i < config_.constant_slots_size(); ++i) {
     MemoryHandlePtr handle =
         constantSlots[i] ? constantSlots[i]->getMemoryHandle() : nullptr;
-    Matrix::resizeOrCreate(constantSlots[i], batchSize,
+    Matrix::resizeOrCreate(constantSlots[i],
+                           batchSize,
                            1,         // = width
                            false,     // = trans
                            useGpu_);  // = useGpu
@@ -216,7 +216,8 @@ void DataProvider::initAsyncLoader() {
 }
 
 SimpleDataProviderBase::SimpleDataProviderBase(const DataConfig& config,
-                                               bool useGpu, bool withInfo)
+                                               bool useGpu,
+                                               bool withInfo)
     : DataProvider(config, useGpu) {
   /* initialize the size of a sample, and the buffer */
   sampleDim_ = config_.feat_dim() * (2 * config_.context_len() + 1);
@@ -337,7 +338,8 @@ int64_t SimpleDataProviderBase::fillBuffer() {
   sampleNumInBuf_ =
       n + fillBufferImp(hInputDataBuf_->getData() + n * sampleDim_,
                         hInputLabelBuf_->getData() + n,
-                        hInputInfoBuf_->getData() + n, bufferCapacity_ - n);
+                        hInputInfoBuf_->getData() + n,
+                        bufferCapacity_ - n);
 
   /* for stochastic gradient training */
   if (!skipShuffle_) {
@@ -357,11 +359,14 @@ SimpleDataProvider::SimpleDataProvider(const DataConfig& config, bool useGpu)
 
 SimpleDataProvider::~SimpleDataProvider() {}
 
-int64_t SimpleDataProvider::fillBufferImp(real* data, int* label, int* info,
+int64_t SimpleDataProvider::fillBufferImp(real* data,
+                                          int* label,
+                                          int* info,
                                           int64_t size) {
   (void)info;
   int64_t n = std::min<int64_t>(labels_.size() - currentSampleIndex_, size);
-  memcpy(data, &data_[currentSampleIndex_ * sampleDim_],
+  memcpy(data,
+         &data_[currentSampleIndex_ * sampleDim_],
          n * sampleDim_ * sizeof(real));
   memcpy(label, &labels_[currentSampleIndex_], sizeof(int) * n);
   currentSampleIndex_ += n;
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 112e45de1cb232097ed63b120d5ac631b37952e9..8b7fb27f821a47d830413eced79b3352a6969c90 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include
@@ -44,15 +43,15 @@ namespace paddle {
  * @brief Macro for registering a data provider. The class type should contain
 * a constructor with parameter (DataConfig, bool).
 */
-#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
-  static InitFunction __reg_type_##__type_name([]() {\
-    DataProvider::registrar_.registerClass(\
-        #__type_name, \
-        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
-          DataProvider* dp = new __class_name (conf, useGpu);\
-          return dp;\
-        });\
-})
+#define REGISTER_DATA_PROVIDER(__type_name, __class_name)                \
+  static InitFunction __reg_type_##__type_name([]() {                    \
+    DataProvider::registrar_.registerClass(                              \
+        #__type_name,                                                    \
+        [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
+          DataProvider* dp = new __class_name(conf, useGpu);             \
+          return dp;                                                     \
+        });                                                              \
+  })
 
 /**
  * @def REGISTER_DATA_PROVIDER_EX
@@ -61,8 +60,8 @@ namespace paddle {
 */
 #define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name)            \
   static InitFunction __reg_type_##__type_name([] {                     \
-    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
-})
+    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
+  })
 
 class DataBatch;
 class BufferBatch;
@@ -181,7 +180,8 @@ public:
    * @param[in]  size     DataBatch.getSize()
   * @param[in]  dataId   sub dataprovider id (in MultiDataProvider)
   */
-  void appendArguments(const std::vector<Argument>& argus, int size,
+  void appendArguments(const std::vector<Argument>& argus,
+                       int size,
                        int dataId) {
     size_ += size;
     for (const auto& argu : argus) {
@@ -259,9 +259,7 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
 
 class DoubleBuffer {
 public:
-  DoubleBuffer(DataProvider* dataPool,
-               bool useGpu,
-               int64_t batchSize = 0);
+  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
   virtual ~DoubleBuffer();
   void removeOneBatch(DataBatch* dataBatch);
 
@@ -310,7 +308,7 @@ public:
   /**
    * @brief create only used for unittest.
   */
-  inline static DataProvider* create(const DataConfig &config,
+  inline static DataProvider* create(const DataConfig& config,
                                      bool useGpu = FLAGS_use_gpu) {
     return create(config, ModelConfig(), useGpu);
   }
@@ -462,7 +460,9 @@ protected:
    *
   * label[n] is the label for the n-th sample.
   */
-  virtual int64_t fillBufferImp(real* data, int* label, int* info,
+  virtual int64_t fillBufferImp(real* data,
+                                int* label,
+                                int* info,
                                 int64_t size) = 0;
 };
 
@@ -475,7 +475,9 @@ public:
 protected:
   void loadData(const std::string& fileName);
   void loadDataFile(const std::string& fileName);
-  virtual int64_t fillBufferImp(real* data, int* label, int* info,
+  virtual int64_t fillBufferImp(real* data,
+                                int* label,
+                                int* info,
                                 int64_t size);
 
 protected:
diff --git a/paddle/gserver/dataproviders/DataProviderGroup.h b/paddle/gserver/dataproviders/DataProviderGroup.h
index 0689f90f3e7dd3d3e1df19f3958c821d53e69700..6c178e29ee714a6bd7f58861d7cf15716fee848d 100644
--- a/paddle/gserver/dataproviders/DataProviderGroup.h
+++ b/paddle/gserver/dataproviders/DataProviderGroup.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include "DataProvider.h"
@@ -65,8 +64,8 @@ void DataProviderGroup<T>::reset() {
   provider_ = nullptr;
 
   // shuffle file list
-  std::shuffle(fileList_.begin(), fileList_.end(),
-               ThreadLocalRandomEngine::get());
+  std::shuffle(
+      fileList_.begin(), fileList_.end(), ThreadLocalRandomEngine::get());
 
   startLoader();
   DataProvider::reset();
@@ -113,8 +112,9 @@ void DataProviderGroup<T>::startLoader() {
     size_t endPos = std::min(fileList_.size(), startPos + loadFileCount);
     std::vector<std::string> fileVec(fileList_.begin() + startPos,
                                      fileList_.begin() + endPos);
-    loader_->addJob([this, fileVec]()
-        -> ProviderPtrType { return this->loadFile(fileVec); });
+    loader_->addJob([this, fileVec]() -> ProviderPtrType {
+      return this->loadFile(fileVec);
+    });
   }
   loader_->stopAddJob();
 }
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index 8e4f53978a0451f3bb6cd5da30f017708448f9ac..51fb1f26668c55dc1c2aecd5389f327e2569a52f 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/utils/Util.h"
 #include "MultiDataProvider.h"
 #include "paddle/utils/Logging.h"
@@ -59,10 +58,8 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
                      "MultiDataProvider";
       subConfig.set_async_load_data(false);
     }
-    subDataProviders_[i] =
-        std::unique_ptr<DataProvider>(DataProvider::create(subConfig,
-                                                           modelConfig,
-                                                           useGpu_));
+    subDataProviders_[i] = std::unique_ptr<DataProvider>(
+        DataProvider::create(subConfig, modelConfig, useGpu_));
   }
 }
 
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.h b/paddle/gserver/dataproviders/MultiDataProvider.h
index b498ba6516c4320566b1b3cc2bd557ae016d7c39..876467c04f074cf37e48fdfa9b24f236fcfe8ba1 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.h
+++ b/paddle/gserver/dataproviders/MultiDataProvider.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #pragma once
 
 #include "DataProvider.h"
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 344644755f24045443b8cb3ebd08004a4b1cdcb5..0a7ff802461f2ded0e6e842c088bddf218361f79 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ #include "paddle/utils/Logging.h" #include "DataProviderGroup.h" -P_DEFINE_double(memory_threshold_on_load_data, 1.0, +P_DEFINE_double(memory_threshold_on_load_data, + 1.0, "stop loading data when memory is not sufficient"); namespace paddle { @@ -32,7 +32,8 @@ REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup); REGISTER_DATA_PROVIDER(proto_sequence_group, DataProviderGroup); -ProtoDataProvider::ProtoDataProvider(const DataConfig& config, bool useGpu, +ProtoDataProvider::ProtoDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll) : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) { if (loadDataAll) { @@ -279,7 +280,8 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) { } slot.sparseNonValueData.resize(slot.indices.back() + slotSize); const unsigned int* ids = sample.vector_slots(i).ids().data(); - memcpy(slot.sparseNonValueData.data() + slot.indices.back(), ids, + memcpy(slot.sparseNonValueData.data() + slot.indices.back(), + ids, sizeof(*ids) * slotSize); slot.indices.push_back(slot.indices.back() + slotSize); if (subSlotSize) { @@ -318,10 +320,11 @@ void ProtoDataProvider::fillSlots(const DataSample& sample) { slot.varDenseData[oldSize].data.resize(varDim); const float* values = sample.vector_slots(i).values().data(); #ifdef PADDLE_TYPE_DOUBLE - std::copy(values, values + varDim, - slot.varDenseData[oldSize].data.data()); + std::copy( + values, values + varDim, slot.varDenseData[oldSize].data.data()); #else - memcpy(slot.varDenseData[oldSize].data.data(), values, + memcpy(slot.varDenseData[oldSize].data.data(), + values, sizeof(real) * varDim); #endif slot.varDenseData[oldSize].dims.resize( @@ -374,8 +377,9 @@ void ProtoDataProvider::reset() { } void ProtoDataProvider::shuffle() { - std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(), - ThreadLocalRandomEngine::get()); + std::shuffle(shuffledSequenceIds_.begin(), + shuffledSequenceIds_.end(), + ThreadLocalRandomEngine::get()); } /* @@ -502,7 +506,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, if (!iidData()) { ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions, - numSequences + 1, /* useGpu= */ false); + numSequences + 1, + /* useGpu= */ false); int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false); int pos = 0; int i = 0; @@ -530,7 +535,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, switch (slotType) { case SlotDef::VECTOR_DENSE: { - Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim, + Matrix::resizeOrCreate(cpuArguments[slot].value, + size, + dim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slot].value->getData(); @@ -543,19 +550,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, } case SlotDef::VECTOR_SPARSE_NON_VALUE: { if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = Matrix::createSparseMatrix( - size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, SPARSE_CSR, - false, useGpu_); + cpuArguments[slot].value = + Matrix::createSparseMatrix(size, + dim, + size /*DEFAULT_AVG_WIDTH = 1*/, + NO_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slot].value; mat->resize(size, dim); if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(dataPos.data(), slots_[slot].indices.data(), - slots_[slot].sparseNonValueData.data(), HPPL_STREAM_1); + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), + slots_[slot].sparseNonValueData.data(), + HPPL_STREAM_1); } else if 
(std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(dataPos.data(), slots_[slot].indices.data(), + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), slots_[slot].sparseNonValueData.data()); } else { LOG(FATAL) << "Not Supported"; @@ -571,19 +586,27 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, } case SlotDef::VECTOR_SPARSE_VALUE: { if (!(cpuArguments[slot].value)) { - cpuArguments[slot].value = Matrix::createSparseMatrix( - size, dim, size /*DEFAULT_AVG_WIDTH = 1*/, FLOAT_VALUE, - SPARSE_CSR, false, useGpu_); + cpuArguments[slot].value = + Matrix::createSparseMatrix(size, + dim, + size /*DEFAULT_AVG_WIDTH = 1*/, + FLOAT_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slot].value; mat->resize(size, dim); if (std::dynamic_pointer_cast(mat)) { - std::dynamic_pointer_cast(mat)->copyFrom( - dataPos.data(), slots_[slot].indices.data(), - slots_[slot].sparseFloatValueData.data(), HPPL_STREAM_1); + std::dynamic_pointer_cast(mat) + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), + slots_[slot].sparseFloatValueData.data(), + HPPL_STREAM_1); } else if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(dataPos.data(), slots_[slot].indices.data(), + ->copyFrom(dataPos.data(), + slots_[slot].indices.data(), slots_[slot].sparseFloatValueData.data()); } else { LOG(FATAL) << "Not Supported"; @@ -591,7 +614,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, break; } case SlotDef::INDEX: { - IVector::resizeOrCreate(cpuArguments[slot].ids, size, + IVector::resizeOrCreate(cpuArguments[slot].ids, + size, /* useGpu= */ false); int* buf = cpuArguments[slot].ids->getData(); for (int i = 0; i < size; ++i) { @@ -621,7 +645,9 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, if (oldWidth < height) { totalDim = width * height * depth; } - Matrix::resizeOrCreate(cpuArguments[slot].value, size, totalDim, + Matrix::resizeOrCreate(cpuArguments[slot].value, + size, + totalDim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slot].value->getData(); @@ -637,13 +663,13 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, } } } else { - memcpy(buf, slots_[slot].varDenseData[dataPos[0]].data.data(), + memcpy(buf, + slots_[slot].varDenseData[dataPos[0]].data.data(), sizeof(real) * totalDim); } - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, + size + 1, /* size == 1 currently */ + /* useGpu= */ false); int* bufStarts = cpuArguments[slot].sequenceStartPositions->getMutableData(false); bufStarts[0] = 0; @@ -653,16 +679,17 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, case SlotDef::VAR_MDIM_INDEX: { CHECK_EQ(size, 1); size_t totalDim = slots_[slot].varIndices[dataPos[0]].size(); - IVector::resizeOrCreate(cpuArguments[slot].ids, totalDim, + IVector::resizeOrCreate(cpuArguments[slot].ids, + totalDim, /* useGpu= */ false); int* buf = cpuArguments[slot].ids->getData(); - memcpy(buf, slots_[slot].varIndices[dataPos[0]].data(), + memcpy(buf, + slots_[slot].varIndices[dataPos[0]].data(), sizeof(int) * totalDim); - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].sequenceStartPositions, - size + 1, /* size == 1 currently */ - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, + size + 1, /* size == 1 currently */ + /* useGpu= */ 
false); int* bufStarts = cpuArguments[slot].sequenceStartPositions->getMutableData(false); bufStarts[0] = 0; @@ -700,8 +727,8 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size, gpuArguments[i].sequenceStartPositions = cpuArguments[i].sequenceStartPositions; } else { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); } } hl_stream_synchronize(HPPL_STREAM_1); @@ -746,10 +773,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, sampleLoop(op, size); // current slot: sequenceStartPositions - ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].sequenceStartPositions, - size + 1, - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions, + size + 1, + /* useGpu= */ false); switch (slotType) { case SlotDef::VECTOR_SPARSE_VALUE: @@ -821,10 +847,10 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, }; int subSize = subSampleLoop(op, size, slot); ICpuGpuVector::resizeOrCreate( - cpuArguments[slot].subSequenceStartPositions, subSize + 1, - false); + cpuArguments[slot].subSequenceStartPositions, subSize + 1, false); int* currPosOfArgumentSubSeqStart = - cpuArguments[slot].subSequenceStartPositions->getMutableData(false); + cpuArguments[slot].subSequenceStartPositions->getMutableData( + false); int64_t* subSeqs = dataSubPos.data(); int64_t* subIndexs = slots_[slot].subIndices.data(); int allSubSequenceLength = 0; @@ -849,7 +875,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, } case SlotDef::INDEX: { // label slot - IVector::resizeOrCreate(cpuArguments[slot].ids, size, + IVector::resizeOrCreate(cpuArguments[slot].ids, + size, /* useGpu= */ false); // fill labels int* buf = cpuArguments[slot].ids->getData(); @@ -863,7 +890,9 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, case SlotDef::VECTOR_DENSE: { // copy values size_t dim = header_.slot_defs(slot).dim(); - Matrix::resizeOrCreate(cpuArguments[slot].value, size, dim, + Matrix::resizeOrCreate(cpuArguments[slot].value, + size, + dim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slot].value->getData(); @@ -887,8 +916,8 @@ int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size, gpuArguments.resize(cpuArguments.size()); gpuBatch.setSize(size); for (size_t i = 0; i < cpuArguments.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); } hl_stream_synchronize(HPPL_STREAM_1); *batch = gpuBatch; diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h index 846dd7673abe8b836be1b728bb690daa0e8acc20..ffdcc8fdc977f53e29dc9f03fa3cf7af56acb92f 100644 --- a/paddle/gserver/dataproviders/ProtoDataProvider.h +++ b/paddle/gserver/dataproviders/ProtoDataProvider.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -48,7 +47,8 @@ namespace paddle { */ class ProtoDataProvider : public DataProvider { public: - ProtoDataProvider(const DataConfig& config, bool useGpu, + ProtoDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll = true); virtual void reset(); @@ -161,14 +161,16 @@ protected: }; /** - * @brief Special use for Proto data: instances should contain sparse-non-value slots + * @brief Special use for Proto data: instances should contain sparse-non-value + * slots * and label. * * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE */ class ProtoSequenceDataProvider : public ProtoDataProvider { public: - ProtoSequenceDataProvider(const DataConfig& config, bool useGpu, + ProtoSequenceDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll = true); ~ProtoSequenceDataProvider() {} virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch); diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h index 3b1eb7e9ef03c42df31c6efc9f0e0240d64e78df..b8fca3cd7f3c5efaea35dc8e09f7ca0ec250830f 100644 --- a/paddle/gserver/dataproviders/ProtoReader.h +++ b/paddle/gserver/dataproviders/ProtoReader.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -138,7 +137,8 @@ protected: * * @note this code depends on protobuf 2.4.0. There is nothing like * CodedInputStream::CurrentPosition() in protobuf 2.5.0 to tell us how many - * bytes has the object readed so far. Therefore, we calculated bytes ourselves. + * bytes the object has read so far. Therefore, we calculate the bytes + * ourselves. */ int approximateReadedBytes_; }; diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp index 1332c0ab635b6ebec05f25fd77b9703b39227bc1..bee6ca14a2ec3995a3b432fc5a39419a5dd8a8ce 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider.cpp @@ -12,21 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "PyDataProvider.h" #include "paddle/utils/PythonUtil.h" #include #include "paddle/utils/Util.h" #include "paddle/utils/Excepts.h" - namespace paddle { #ifndef PADDLE_NO_PYTHON REGISTER_DATA_PROVIDER(py, PyDataProvider); #endif -PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu, +PyDataProvider::PyDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll) : DataProvider(config, useGpu), batchSize_(0) { PyGuard guard; @@ -50,8 +49,8 @@ void PyDataProvider::loadData(const std::vector& fileList) { classInstance_ = createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_); CHECK(classInstance_) << "Create class instance failed."; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("getHeader"), NULL)); + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("getHeader"), NULL)); CHECK_PY(obj) << "Call function getHeader failed."; std::string headerInfo = std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); @@ -90,7 +89,8 @@ void PyDataProvider::resetSlots() { } } -void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillDenseSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { unsigned int dim = slot.dim; slot.sampleNum = readT(data, dataEnd); @@ -102,14 +102,17 @@ void PyDataProvider::fillDenseSlot(ProtoSlot& slot, char*& data, float* dat = reinterpret_cast(data); std::copy(dat, dat + slot.sampleNum * dim, slot.denseData.begin()); #else - memcpyWithCheck(slot.denseData.data(), data, - sizeof(real) * dim * slot.sampleNum, dataEnd); + memcpyWithCheck(slot.denseData.data(), + data, + sizeof(real) * dim * slot.sampleNum, + dataEnd); #endif // PyDataProvider always provide data in float data += sizeof(float) * dim * slot.sampleNum; } -void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); unsigned int* indexPtr = (unsigned int*)data; @@ -121,12 +124,15 @@ void PyDataProvider::fillSparseNonValueSlot(ProtoSlot& slot, char*& data, length = readT(data, dataEnd); slot.indices.push_back(length); slot.sparseNonValueData.resize(length); - memcpyWithCheck(slot.sparseNonValueData.data(), data, - sizeof(unsigned int) * length, dataEnd); + memcpyWithCheck(slot.sparseNonValueData.data(), + data, + sizeof(unsigned int) * length, + dataEnd); data += sizeof(unsigned int) * length; } -void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); unsigned int* indexPtr = (unsigned int*)data; @@ -153,7 +159,8 @@ void PyDataProvider::fillSparseValueSlot(ProtoSlot& slot, char*& data, } } -void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillIndexSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); CHECK_LE(data + sizeof(unsigned int) * slot.sampleNum, dataEnd) @@ -163,7 +170,8 @@ void PyDataProvider::fillIndexSlot(ProtoSlot& slot, char*& data, data += sizeof(unsigned int) * slot.sampleNum; } -void PyDataProvider::fillStringSlot(ProtoSlot& slot, char*& data, +void PyDataProvider::fillStringSlot(ProtoSlot& slot, + char*& data, const char* dataEnd) { slot.sampleNum = readT(data, dataEnd); for (unsigned int i = 0; i < slot.sampleNum; ++i) { @@ -225,9 +233,8 @@ void 
PyDataProvider::fillSlotsByStr(const std::string& samples) { } for (size_t i = 0; i < sequenceNum; ++i) { size_t begin = slot.sequenceStartPositions[i]; - size_t end = (i < sequenceNum - 1) - ? slot.sequenceStartPositions[i + 1] - : slot.sampleNum; + size_t end = (i < sequenceNum - 1) ? slot.sequenceStartPositions[i + 1] + : slot.sampleNum; for (size_t ii = begin; ii < end; ++ii) { slot.sampleSequenceIdVec.push_back(ii); } @@ -255,8 +262,8 @@ void PyDataProvider::fillSlotsByStr(const std::string& samples) { void PyDataProvider::reset() { { // Invoke PyDataProvider Reset PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("reset"), NULL)); + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("reset"), NULL)); CHECK_PY(obj) << "Call function reset failed."; } @@ -270,15 +277,18 @@ void PyDataProvider::reset() { void PyDataProvider::shuffle() { // py shuffle PyGuard guard; - PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), - const_cast("shuffle"), NULL)); + PyObjectPtr obj(PyObject_CallMethod( + classInstance_.get(), const_cast("shuffle"), NULL)); CHECK_PY(obj) << "Call function shuffle failed."; } -void PyDataProvider::handleDenseSlot(ProtoSlot& slot, size_t slotIndex, +void PyDataProvider::handleDenseSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments) { unsigned int dim = slot.dim; - Matrix::resizeOrCreate(cpuArguments[slotIndex].value, slot.sampleNum, dim, + Matrix::resizeOrCreate(cpuArguments[slotIndex].value, + slot.sampleNum, + dim, false, // trans = false false); // useGpu = false real* buf = cpuArguments[slotIndex].value->getData(); @@ -294,19 +304,27 @@ void PyDataProvider::handleSparseNonValueSlot( ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { unsigned int dim = slot.dim; if (!(cpuArguments[slotIndex].value)) { - cpuArguments[slotIndex].value = Matrix::createSparseMatrix( - slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, NO_VALUE, - SPARSE_CSR, false, useGpu_); + cpuArguments[slotIndex].value = + Matrix::createSparseMatrix(slot.sampleNum, + dim, + slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, + NO_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slotIndex].value; mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR); if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), - slot.sparseNonValueData.data(), HPPL_STREAM_1); + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseNonValueData.data(), + HPPL_STREAM_1); } else if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), slot.sparseNonValueData.data()); } else { LOG(FATAL) << "Not Supported"; @@ -317,28 +335,38 @@ void PyDataProvider::handleSparseValueSlot( ProtoSlot& slot, size_t slotIndex, std::vector& cpuArguments) { unsigned int dim = slot.dim; if (!(cpuArguments[slotIndex].value)) { - cpuArguments[slotIndex].value = Matrix::createSparseMatrix( - slot.sampleNum, dim, slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, - FLOAT_VALUE, SPARSE_CSR, false, useGpu_); + cpuArguments[slotIndex].value = + Matrix::createSparseMatrix(slot.sampleNum, + dim, + slot.sampleNum /*DEFAULT_AVG_WIDTH = 1*/, + FLOAT_VALUE, + SPARSE_CSR, + false, + useGpu_); } auto mat = cpuArguments[slotIndex].value; mat->resize(slot.sampleNum, dim, slot.sampleNum, 
FLOAT_VALUE, SPARSE_CSR); if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), - slot.sparseFloatValueData.data(), HPPL_STREAM_DEFAULT); + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), + slot.sparseFloatValueData.data(), + HPPL_STREAM_DEFAULT); } else if (std::dynamic_pointer_cast(mat)) { std::dynamic_pointer_cast(mat) - ->copyFrom(slot.sampleSequenceIdVec.data(), slot.indices.data(), + ->copyFrom(slot.sampleSequenceIdVec.data(), + slot.indices.data(), slot.sparseFloatValueData.data()); } else { LOG(FATAL) << "Not Supported"; } } -void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex, +void PyDataProvider::handleIndexSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments) { - IVector::resizeOrCreate(cpuArguments[slotIndex].ids, slot.sampleNum, + IVector::resizeOrCreate(cpuArguments[slotIndex].ids, + slot.sampleNum, /*useGpu_*/ false); int* buf = cpuArguments[slotIndex].ids->getData(); for (size_t i = 0; i < slot.sampleNum; ++i) { @@ -346,7 +374,8 @@ void PyDataProvider::handleIndexSlot(ProtoSlot& slot, size_t slotIndex, } } -void PyDataProvider::handleStringSlot(ProtoSlot& slot, size_t slotIndex, +void PyDataProvider::handleStringSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments) { if (cpuArguments[slotIndex].strs) { cpuArguments[slotIndex].strs->resize(slot.sampleNum); @@ -364,7 +393,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { PyGuard guard; PyObjectPtr obj(PyObject_CallMethod(classInstance_.get(), const_cast("getNextBatch"), - const_cast("i"), size)); + const_cast("i"), + size)); CHECK_PY(obj) << "Call function getNextBatch failed."; const std::string& samples = std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); @@ -381,23 +411,24 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { if (!iidData()) { for (size_t j = 0; j < slotNum_; ++j) { auto& slot = slots_[j]; - ICpuGpuVector::resizeOrCreate( - cpuArguments[j].sequenceStartPositions, - slot.sequenceNum + 1, /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[j].sequenceStartPositions, + slot.sequenceNum + 1, + /* useGpu= */ false); int* buf = cpuArguments[j].sequenceStartPositions->getMutableData(false); std::copy(slot.sequenceStartPositions.begin(), - slot.sequenceStartPositions.end(), buf); + slot.sequenceStartPositions.end(), + buf); buf[slot.sequenceStartPositions.size()] = slot.sampleNum; if (slot.subSequenceStartPositions.size()) { - ICpuGpuVector::resizeOrCreate( - cpuArguments[j].subSequenceStartPositions, - slot.subSequenceNum + 1, - /* useGpu= */ false); + ICpuGpuVector::resizeOrCreate(cpuArguments[j].subSequenceStartPositions, + slot.subSequenceNum + 1, + /* useGpu= */ false); int* buf = - cpuArguments[j].subSequenceStartPositions->getMutableData(false); + cpuArguments[j].subSequenceStartPositions->getMutableData(false); std::copy(slot.subSequenceStartPositions.begin(), - slot.subSequenceStartPositions.end(), buf); + slot.subSequenceStartPositions.end(), + buf); buf[slot.subSequenceNum] = slot.sampleNum; // check subSequenceStartPositions and sequenceStartPositions cpuArguments[j].checkSubset(); @@ -452,8 +483,8 @@ int64_t PyDataProvider::getNextBatchInternal(int64_t size, DataBatch* batch) { cpuArguments[i].subSequenceStartPositions; } } else { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + 
cpuArguments[i], useGpu_, HPPL_STREAM_1); } } hl_stream_synchronize(HPPL_STREAM_1); diff --git a/paddle/gserver/dataproviders/PyDataProvider.h b/paddle/gserver/dataproviders/PyDataProvider.h index 939d9cf725c2fe6e4989c17e1e768c9f8aedfc95..6bb7c831fdd451abc5241199d6a4d1b1ad814517 100644 --- a/paddle/gserver/dataproviders/PyDataProvider.h +++ b/paddle/gserver/dataproviders/PyDataProvider.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -25,7 +24,8 @@ namespace paddle { class PyDataProvider : public DataProvider { public: - PyDataProvider(const DataConfig& config, bool useGpu, + PyDataProvider(const DataConfig& config, + bool useGpu, bool loadDataAll = true); virtual void reset(); @@ -48,21 +48,27 @@ protected: void parseHeaderData(const std::string& headerData); void fillDenseSlot(ProtoSlot& slot, char*& data, const char* dataEnd); - void fillSparseNonValueSlot(ProtoSlot& slot, char*& data, + void fillSparseNonValueSlot(ProtoSlot& slot, + char*& data, const char* dataEnd); void fillSparseValueSlot(ProtoSlot& slot, char*& data, const char* dataEnd); void fillIndexSlot(ProtoSlot& slot, char*& data, const char* dataEnd); void fillStringSlot(ProtoSlot& slot, char*& data, const char* dataEnd); void fillSlotsByStr(const std::string& samples); - void handleDenseSlot(ProtoSlot& slot, size_t slotIndex, + void handleDenseSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleSparseNonValueSlot(ProtoSlot& slot, size_t slotIndex, + void handleSparseNonValueSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleSparseValueSlot(ProtoSlot& slot, size_t slotIndex, + void handleSparseValueSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleIndexSlot(ProtoSlot& slot, size_t slotIndex, + void handleIndexSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); - void handleStringSlot(ProtoSlot& slot, size_t slotIndex, + void handleStringSlot(ProtoSlot& slot, + size_t slotIndex, std::vector& cpuArguments); void resetSlots(); void loadData(const std::vector& fileList); diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index 90391a7c307d8dff7e289d445cafd27dc5008547..967fc9026a39967477d606862e060b680512901a 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -34,7 +34,7 @@ namespace paddle { namespace unittest { static std::unique_ptr> - OnPoolFilled; + OnPoolFilled; namespace pydp2 { @@ -43,15 +43,11 @@ void setOnPoolFilledHook(const std::function& callback) { *OnPoolFilled = callback; } -void clearOnPoolFilledHook() { - OnPoolFilled.reset(); -} +void clearOnPoolFilledHook() { OnPoolFilled.reset(); } } // namespace pydp2 } // namespace unittest - - /** * Slot type */ @@ -65,17 +61,13 @@ enum SlotType { /** * Sequence type */ -enum SeqType { - SQT_NONE = 0, - SQT_SEQ, - SQT_SUBSEQ -}; +enum SeqType { SQT_NONE = 0, SQT_SEQ, SQT_SUBSEQ }; /** * Cache Type. */ enum CacheType { - NO_CACHE = 0, // Each pass will load data from PyDataProvider2. + NO_CACHE = 0, // Each pass will load data from PyDataProvider2. CACHE_PASS_IN_MEM = 1, // First pass will load data from PyDataProvider2, // then cache all data in memory. Load data from // memory in rest passes. 
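The NO_CACHE / CACHE_PASS_IN_MEM values above define a small pass-level protocol between the provider and its cache: at the start of each pass the provider asks the cache whether data must be fetched from Python again, replays the in-memory pool when it can, and hands consumed objects back to the cache otherwise. Below is a minimal, self-contained sketch of that contract; Item, IPassCache and CacheOnePass are illustrative stand-ins for this note only (the real interface is IPyDataProviderCache over deques of PyObjectPtr, and its create() switch appears further down in this file):

#include <deque>
#include <iostream>

using Item = int;  // stand-in for PyObjectPtr

// Pass-level cache contract, mirroring the reset()/load()/drop() trio of
// IPyDataProviderCache.
class IPassCache {
public:
  virtual ~IPassCache() {}
  // Returns true when the caller must (re)load data from the Python side.
  virtual bool reset() = 0;
  // Returns the in-memory pool, or nullptr when nothing is cached.
  virtual std::deque<Item>* load() = 0;
  // Takes ownership of objects the caller has finished consuming.
  virtual void drop(std::deque<Item>* data) = 0;
};

// CACHE_PASS_IN_MEM behaviour: the first pass loads and caches, later
// passes replay the cached pool.
class CacheOnePass : public IPassCache {
public:
  bool reset() override {
    if (pool_.empty() && dropped_.empty()) return true;  // first pass
    pool_.insert(pool_.end(), dropped_.begin(), dropped_.end());
    dropped_.clear();
    return false;  // later passes: replay from memory
  }
  std::deque<Item>* load() override { return pool_.empty() ? nullptr : &pool_; }
  void drop(std::deque<Item>* data) override {
    dropped_.insert(dropped_.end(), data->begin(), data->end());
    data->clear();
  }

private:
  std::deque<Item> pool_, dropped_;
};

int main() {
  CacheOnePass cache;
  for (int pass = 0; pass < 2; ++pass) {
    std::deque<Item> batch;
    if (cache.reset()) {
      batch = {1, 2, 3};  // pretend these crossed the Python boundary
      std::cout << "pass " << pass << ": loaded from Python\n";
      cache.drop(&batch);  // consumed -> cached for the next pass
    } else {
      std::cout << "pass " << pass << ": replayed " << cache.load()->size()
                << " items from memory\n";
    }
  }
  return 0;
}

Under this contract CACHE_PASS_IN_MEM pays the Python-crossing cost only once, while NO_CACHE simply makes reset() always report that a reload is needed.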
@@ -87,8 +79,8 @@ struct SlotHeader { // Slot Header will parse from python object's slots field. SeqType seqType; }; -inline std::ostream& operator << (std::ostream& os, const SlotHeader& header) { - os <<"Dim = " << header.dim << " Type = " << header.slotType +inline std::ostream& operator<<(std::ostream& os, const SlotHeader& header) { + os << "Dim = " << header.dim << " Type = " << header.slotType << " SeqType = " << header.seqType; return os; } @@ -158,7 +150,6 @@ protected: SlotHeader* headerPtr_; }; - /** * Py Data Provider Cache Interface. */ @@ -209,17 +200,13 @@ public: PyDataProvider2(const DataConfig& config, const ModelConfig& modelConfig, bool useGpu) - :DataProvider(config, useGpu), - callingContextCreated_(2) { - if (PyArray_API == NULL) - import_array(); + : DataProvider(config, useGpu), callingContextCreated_(2) { + if (PyArray_API == NULL) import_array(); auto& args = config.load_data_args(); PyObjectPtr kwargs = PyObjectPtr(PyDict_New()); if (!args.empty()) { kwargs = callPythonFuncRetPyObj( - "paddle.trainer.PyDataProvider2", - "deserialize_args", - {args}); + "paddle.trainer.PyDataProvider2", "deserialize_args", {args}); } py::DictHelper kwargsDict(kwargs); @@ -245,40 +232,38 @@ public: * Dtor * @note will stop loading thread when destructing */ - virtual ~PyDataProvider2() { - resetImpl(false); - } + virtual ~PyDataProvider2() { resetImpl(false); } private: void createPyDataObj(const std::string& model, const std::string& className, const std::string& fileListName, - PyObjectPtr && kwargs) { - LOG(INFO) << "loading dataprovider " << model <<"::" << className; + PyObjectPtr&& kwargs // NOLINT + ) { + LOG(INFO) << "loading dataprovider " << model << "::" << className; PyObjectPtr module = py::import(model); PyObjectPtr moduleDict(PyModule_GetDict(module.get())); CHECK_PY(moduleDict) << "Invoke module.__dict__ error"; - PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), - className.c_str())); + PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(), className.c_str())); CHECK_PY(cls) << "load class " << className.c_str() << "error"; // If there are multiple python instance share same module, the PyObjectPtr // only for instance will make python reference-count error. // // So here, we increase reference count manually. - if (gModuleClsPtrs_.find((uintptr_t) module.get()) - != gModuleClsPtrs_.end()) { + if (gModuleClsPtrs_.find((uintptr_t)module.get()) != + gModuleClsPtrs_.end()) { // Multi instance use same module Py_XINCREF(module.get()); Py_XINCREF(moduleDict.get()); } else { - gModuleClsPtrs_.insert((uintptr_t) module.get()); + gModuleClsPtrs_.insert((uintptr_t)module.get()); } - if (gModuleClsPtrs_.find((uintptr_t) cls.get()) != gModuleClsPtrs_.end()) { + if (gModuleClsPtrs_.find((uintptr_t)cls.get()) != gModuleClsPtrs_.end()) { Py_XINCREF(cls.get()); } else { - gModuleClsPtrs_.insert((uintptr_t) cls.get()); + gModuleClsPtrs_.insert((uintptr_t)cls.get()); } PyObjectPtr fileListInPy = loadPyFileLists(fileListName); @@ -294,8 +279,8 @@ private: py::ObjectHelper self(this->instance_); bool ok; - this->skipShuffle_ = !self.getBoolAttr("should_shuffle", - &ok /*isBoolType*/); + this->skipShuffle_ = + !self.getBoolAttr("should_shuffle", &ok /*isBoolType*/); if (!ok) { this->skipShuffle_ = testing; // shuffle when is training, skip shuffle // when is testing. 
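The manual Py_XINCREF calls in createPyDataObj() above implement a first-owner-keeps, later-owners-increment scheme: each PyObjectPtr wrapper decrements its object exactly once when destroyed, so when a second provider instance wraps the same module or class object, one extra reference must be added by hand. Here is a toy sketch of that bookkeeping, with Handle standing in for PyObject and incref() for Py_XINCREF; registerShared and gRegistry are illustrative names, not part of this patch:

#include <cstdint>
#include <iostream>
#include <unordered_set>

// Toy stand-ins: Handle mimics a PyObject with an intrusive reference
// count; nothing here is PaddlePaddle or CPython API.
struct Handle {
  int refcnt = 1;  // the count held by the first PyObjectPtr-like owner
};

void incref(Handle* h) {
  if (h) ++h->refcnt;
}

// Plays the role of gModuleClsPtrs_: remembers which objects already have
// a registered owner, keyed by raw address.
std::unordered_set<uintptr_t> gRegistry;

void registerShared(Handle* h) {
  auto key = reinterpret_cast<uintptr_t>(h);
  if (gRegistry.count(key)) {
    incref(h);  // a later instance shares the object: add its own reference
  } else {
    gRegistry.insert(key);  // first owner keeps the reference it already has
  }
}

int main() {
  Handle module;            // pretend this is an imported Python module
  registerShared(&module);  // first data provider instance
  registerShared(&module);  // second instance reusing the same module
  std::cout << "refcnt = " << module.refcnt << "\n";  // prints: refcnt = 2
  return 0;
}

gModuleClsPtrs_ plays the role of gRegistry here, keyed by the raw object address.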
@@ -335,12 +320,12 @@ private: PyObjectPtr headerPtrWrap(hdPtr); py::ObjectHelper hd(headerPtrWrap); header.dim = hd.getIntAttrWithError("dim"); - header.seqType = (SeqType) hd.getIntAttrWithError("seq_type"); - header.slotType = (SlotType) hd.getIntAttrWithError("type"); + header.seqType = (SeqType)hd.getIntAttrWithError("seq_type"); + header.slotType = (SlotType)hd.getIntAttrWithError("type"); } DBG << "Data header size " << headers_.size(); - for (auto & header : headers_) { + for (auto& header : headers_) { DBG << header; } cache_.reset(IPyDataProviderCache::create( @@ -351,8 +336,7 @@ private: loadFileList(fileListName, fileLists_); PyObject* lst = PyList_New(fileLists_.size()); for (size_t i = 0; i < fileLists_.size(); ++i) { - PyList_SET_ITEM(lst, i, - PyString_FromString(fileLists_[i].c_str())); + PyList_SET_ITEM(lst, i, PyString_FromString(fileLists_[i].c_str())); } return PyObjectPtr(lst); } @@ -414,11 +398,12 @@ private: CHECK(ok) << "CalcBatchSize must return int or long"; } - if (this->loadThread_){ // wait poolActualSize < poolSize; + if (this->loadThread_) { // wait poolActualSize < poolSize; std::unique_lock l(mtx_); - pushCV_.wait(l, [this, additionalBatchSize] { - return this->poolActualSize_ < poolSize_; - }); + pushCV_.wait(l, + [this, additionalBatchSize] { + return this->poolActualSize_ < poolSize_; + }); } { @@ -487,14 +472,14 @@ private: std::vector fileLists_; std::vector headers_; static PyObjectPtr zeroTuple_; - static std::unordered_set gModuleClsPtrs_; + static std::unordered_set gModuleClsPtrs_; class PositionRandom { public: - inline explicit PositionRandom(bool skipRand): - eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {} + inline explicit PositionRandom(bool skipRand) + : eng_(ThreadLocalRandomEngine::get()), skipRand_(skipRand) {} - inline size_t operator() (size_t len) { + inline size_t operator()(size_t len) { if (!skipRand_) { if (!dist_ || dist_->b() != len - 1) { dist_.reset(new std::uniform_int_distribution(0, len - 1)); @@ -525,32 +510,31 @@ public: * Shuffle. Do nothing because PyDataProvider do shuffle implicitly by random * select data from datapool. */ - void shuffle() { - } + void shuffle() {} /** * Not limited size. */ - int64_t getSize() { - return -1; - } + int64_t getSize() { return -1; } /** * Loading a batch of data. */ - int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) { + int64_t getNextBatchInternal(int64_t size_, DataBatch* batch) { std::lock_guard guard(mutexForReset_); REGISTER_TIMER("PyDP2.getNextBatchInternal") CHECK_GE(size_, 0); - size_t size = (size_t) size_; + size_t size = (size_t)size_; if (loadThread_) { // loading from thread should wait for data pool ready. // but, loading from cache, cache object should ensure // data pool ready. 
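// (pushCV_ above and pullCV_ below form a classic bounded-buffer pair: the producer side blocks on pushCV_ while the pool is full, i.e. poolActualSize_ >= poolSize_, and the consumer here blocks on pullCV_ until poolActualSize_ reaches std::max(size, minPoolSize_) or callingContexts_ is empty.)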
std::unique_lock l(mtx_); - pullCV_.wait(l, [this, &size] { - return this->poolActualSize_ >= std::max(size, this->minPoolSize_) - || callingContexts_.empty(); - }); + pullCV_.wait(l, + [this, &size] { + return this->poolActualSize_ >= + std::max(size, this->minPoolSize_) || + callingContexts_.empty(); + }); if (unittest::OnPoolFilled) { (*unittest::OnPoolFilled)(this->poolActualSize_); @@ -633,35 +617,35 @@ public: cpuBatch.setSize(bsize); auto& inArgs = cpuBatch.getStreams(); inArgs.resize(headers_.size()); - std::vector > scanners; + std::vector> scanners; scanners.reserve(headers_.size()); for (auto& header : headers_) { scanners.emplace_back(IFieldScanner::create(&header)); } DBG << "Scanner created."; - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->startPrepare(inArgs[i]); } - for (auto & d : data) { + for (auto& d : data) { py::SequenceHelper s(d); - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->prepare(inArgs[i], s[i]); } } - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->finishPrepare(inArgs[i]); } - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->startFill(inArgs[i]); } - for (auto & d : data) { + for (auto& d : data) { py::SequenceHelper s(d); for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->fill(inArgs[i], s[i]); } } - for (size_t i=0; i < headers_.size(); ++i) { + for (size_t i = 0; i < headers_.size(); ++i) { scanners[i]->finishFill(inArgs[i]); } @@ -679,8 +663,8 @@ public: gpuArguments.resize(cpuArguments.size()); gpuBatch.setSize(size); for (size_t i = 0; i < headers_.size(); ++i) { - gpuArguments[i].resizeAndCopyFrom(cpuArguments[i], useGpu_, - HPPL_STREAM_1); + gpuArguments[i].resizeAndCopyFrom( + cpuArguments[i], useGpu_, HPPL_STREAM_1); } hl_stream_synchronize(HPPL_STREAM_1); } else { @@ -690,31 +674,28 @@ public: } }; -std::unordered_set PyDataProvider2::gModuleClsPtrs_; +std::unordered_set PyDataProvider2::gModuleClsPtrs_; PyObjectPtr PyDataProvider2::zeroTuple_(PyTuple_New(0)); REGISTER_DATA_PROVIDER_EX(py2, PyDataProvider2); - /** * Scanner for dense slot. */ -class DenseScanner: public IFieldScanner { +class DenseScanner : public IFieldScanner { public: - explicit DenseScanner(SlotHeader* ptr):IFieldScanner(ptr), height_(0) {} + explicit DenseScanner(SlotHeader* ptr) : IFieldScanner(ptr), height_(0) {} /** * Prepare. * @param argument target argument * @param obj each timestep of a sample. 
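* (prepare() only counts rows via height_; finishPrepare() below then sizes the height_ x dim value matrix with a single resizeOrCreate call.)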
*/ - virtual void prepare(Argument &argument, PyObject *obj) { - ++height_; - } + virtual void prepare(Argument& argument, PyObject* obj) { ++height_; } - virtual void finishPrepare(Argument &argument) { - Matrix::resizeOrCreate(argument.value, height_, headerPtr_->dim, - false, false); + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreate( + argument.value, height_, headerPtr_->dim, false, false); height_ = 0; } @@ -723,24 +704,23 @@ public: * @param argument * @param obj */ - virtual void fill(Argument &argument, PyObject *obj) { + virtual void fill(Argument& argument, PyObject* obj) { real* dat = argument.value->getData() + height_ * headerPtr_->dim; if (PyArray_Check(obj)) { - auto dtype = PyArray_DTYPE((PyArrayObject*)obj); - if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { - real * data = (real*)PyArray_DATA((PyArrayObject*)obj); - auto sz = PyArray_SIZE((PyArrayObject*)obj); - std::copy(data, data + sz, dat); - } else { - LOG(FATAL) << "You should yield float" << sizeof(real) * 8 - << " array"; - } - } else { - py::SequenceHelper s(obj); - // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. - for (size_t i=0; i < headerPtr_->dim; ++i) { - dat[i] = (real) s.getDouble(i); - } + auto dtype = PyArray_DTYPE((PyArrayObject*)obj); + if (dtype->type == 'f' && dtype->elsize == sizeof(real)) { + real* data = (real*)PyArray_DATA((PyArrayObject*)obj); + auto sz = PyArray_SIZE((PyArrayObject*)obj); + std::copy(data, data + sz, dat); + } else { + LOG(FATAL) << "You should yield float" << sizeof(real) * 8 << " array"; + } + } else { + py::SequenceHelper s(obj); + // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy. + for (size_t i = 0; i < headerPtr_->dim; ++i) { + dat[i] = (real)s.getDouble(i); + } } ++height_; } @@ -752,20 +732,18 @@ private: /** * Scanner for index slot */ -class IndexScanner: public IFieldScanner { +class IndexScanner : public IFieldScanner { public: - explicit IndexScanner(SlotHeader* ptr):IFieldScanner(ptr), cnt_(0) {} + explicit IndexScanner(SlotHeader* ptr) : IFieldScanner(ptr), cnt_(0) {} /** * Prepare memory space. * * @note obj is a single timestep of sample */ - virtual void prepare(Argument &argument, PyObject *obj) { - ++cnt_; - } + virtual void prepare(Argument& argument, PyObject* obj) { ++cnt_; } - virtual void finishPrepare(Argument &argument) { + virtual void finishPrepare(Argument& argument) { IVector::resizeOrCreate(argument.ids, cnt_, false); cnt_ = 0; } @@ -773,9 +751,9 @@ public: /** * Fill one index to argument. */ - virtual void fill(Argument &argument, PyObject *obj) { + virtual void fill(Argument& argument, PyObject* obj) { bool ok; - argument.ids->getData()[cnt_++] = py::castInt(obj, &ok); + argument.ids->getData()[cnt_++] = py::castInt(obj, &ok); CHECK(ok) << "Cannot cast int " << py::repr(obj); } @@ -785,27 +763,25 @@ private: class SparseNonValueScanner : public IFieldScanner { public: - explicit SparseNonValueScanner(SlotHeader* ptr): IFieldScanner(ptr), - nnz_(0), - height_(0) {} + explicit SparseNonValueScanner(SlotHeader* ptr) + : IFieldScanner(ptr), nnz_(0), height_(0) {} /** * Prepare memory space * @note obj is a timestep of one sample. 
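* (besides counting rows, prepare() accumulates nnz_ so that finishPrepare() can size the CSR matrix, height_ rows by dim columns with nnz_ non-zeros, in one call.)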
*/ - virtual void prepare(Argument &argument, PyObject *obj) { + virtual void prepare(Argument& argument, PyObject* obj) { ++height_; nnz_ += py::SequenceHelper(obj).size(); } - virtual void finishPrepare(Argument &argument) { - Matrix::resizeOrCreateSparseMatrix(argument.value, height_, - headerPtr_->dim, - nnz_, NO_VALUE); + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreateSparseMatrix( + argument.value, height_, headerPtr_->dim, nnz_, NO_VALUE); } - virtual void startFill(Argument & argument) { - auto smat = (CpuSparseMatrix*) (argument.value.get()); + virtual void startFill(Argument& argument) { + auto smat = (CpuSparseMatrix*)(argument.value.get()); smat->getRows()[0] = 0; nnz_ = 0; height_ = 1; @@ -818,14 +794,14 @@ public: virtual void fill(Argument& argument, PyObject* obj) { py::SequenceHelper s(obj); auto sz = s.size(); - auto smat = (CpuSparseMatrix*) (argument.value.get()); + auto smat = (CpuSparseMatrix*)(argument.value.get()); int* row = smat->getRows(); int* col = smat->getCols(); real* dat = smat->getData(); - row[height_] = row[height_-1] + (int)sz; + row[height_] = row[height_ - 1] + (int)sz; for (decltype(sz) i = 0; i < sz; ++i) { - setData(col+nnz_, dat+nnz_, s[i]); + setData(col + nnz_, dat + nnz_, s[i]); ++nnz_; } ++height_; @@ -839,7 +815,7 @@ protected: * @param [in] obj Python Object. For sparse_non_value is a PyInt or PyLong. * For sparse_value is a Tuple (int, float). */ - virtual void setData(int* col, real * dat, PyObject* obj) { + virtual void setData(int* col, real* dat, PyObject* obj) { bool ok; *col = py::castInt(obj, &ok); CHECK(ok); @@ -851,26 +827,25 @@ protected: class SparseValueScanner : public SparseNonValueScanner { public: - explicit SparseValueScanner(SlotHeader *ptr) : SparseNonValueScanner(ptr) {} + explicit SparseValueScanner(SlotHeader* ptr) : SparseNonValueScanner(ptr) {} - virtual void finishPrepare(Argument &argument) { - Matrix::resizeOrCreateSparseMatrix(argument.value, height_, - headerPtr_->dim, - nnz_, FLOAT_VALUE); + virtual void finishPrepare(Argument& argument) { + Matrix::resizeOrCreateSparseMatrix( + argument.value, height_, headerPtr_->dim, nnz_, FLOAT_VALUE); } protected: - virtual void setData(int *col, real *dat, PyObject *obj) { + virtual void setData(int* col, real* dat, PyObject* obj) { py::SequenceHelper s(obj); SparseNonValueScanner::setData(col, dat, s[0]); - *dat = (real) s.getDouble(1); + *dat = (real)s.getDouble(1); } }; /** * Sequence Scanner. Scanner for sequence or sub-sequence. */ -class SequenceScanner: public IFieldScanner { +class SequenceScanner : public IFieldScanner { public: /** * Ctor @@ -879,15 +854,18 @@ public: * return a sequence start position or a sub-sequence * start position. */ - SequenceScanner(std::unique_ptr&& innerScanner, - const std::function& getSeqStartPos) - : IFieldScanner(nullptr), inner_(std::move(innerScanner)), - cnt_(0), getSeqStartPos_(getSeqStartPos) {} + SequenceScanner( + std::unique_ptr&& innerScanner, + const std::function& getSeqStartPos) + : IFieldScanner(nullptr), + inner_(std::move(innerScanner)), + cnt_(0), + getSeqStartPos_(getSeqStartPos) {} /** * Start prepare. Invoke inner->startPrepare too. */ - virtual void startPrepare(Argument &argument) { + virtual void startPrepare(Argument& argument) { inner_->startPrepare(argument); } @@ -895,10 +873,10 @@ public: * Prepare. obj is a list or tuple. it will invoke inner_->prepare for each * element of sequence obj. 
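* (for sub-sequence slots, IFieldScanner::create below nests two SequenceScanners, so each level peels off one sequence dimension before reaching the inner scalar scanner.)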
*/ - virtual void prepare(Argument &argument, PyObject *obj) { + virtual void prepare(Argument& argument, PyObject* obj) { py::SequenceHelper s(obj); ++cnt_; - for (size_t i=0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { inner_->prepare(argument, s[i]); } } @@ -906,7 +884,7 @@ public: /** * Finish prepare. invoke inner_->finishPrepare too. */ - virtual void finishPrepare(Argument &argument) { + virtual void finishPrepare(Argument& argument) { ICpuGpuVector::resizeOrCreate(getSeqStartPos_(argument), cnt_ + 1, false); inner_->finishPrepare(argument); } @@ -914,7 +892,7 @@ public: /** * Start fill. invoke inner->startFill too. */ - virtual void startFill(Argument &argument) { + virtual void startFill(Argument& argument) { getSeqStartPos_(argument)->getMutableData(false)[0] = 0; cnt_ = 1; inner_->startFill(argument); @@ -925,13 +903,13 @@ public: * sequence obj. And set seqStartPos at same time. The seqStartPos will be * calculated by getSeqStartPos callback passed in ctor. */ - virtual void fill(Argument &argument, PyObject *obj) { + virtual void fill(Argument& argument, PyObject* obj) { getSeqStartPos_(argument)->getMutableData(false)[cnt_] = - getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] + - (int)getSize(obj); + getSeqStartPos_(argument)->getMutableData(false)[cnt_ - 1] + + (int)getSize(obj); py::SequenceHelper s(obj); ++cnt_; - for (size_t i=0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { inner_->fill(argument, s[i]); } } @@ -939,9 +917,7 @@ public: /** * Finish fill. will invoke inner->finishFill too. */ - virtual void finishFill(Argument &argument) { - inner_->finishFill(argument); - } + virtual void finishFill(Argument& argument) { inner_->finishFill(argument); } protected: size_t getSize(PyObject* obj) { @@ -949,7 +925,7 @@ protected: auto sc = dynamic_cast(inner_.get()); if (sc) { size_t sum = 0; - for (size_t i=0; i < s.size(); ++i) { + for (size_t i = 0; i < s.size(); ++i) { sum += sc->getSize(s[i]); } return sum; @@ -964,8 +940,7 @@ private: std::function getSeqStartPos_; }; - -IFieldScanner* IFieldScanner::create(SlotHeader *header) { +IFieldScanner* IFieldScanner::create(SlotHeader* header) { IFieldScanner* retv = nullptr; switch (header->slotType) { case ST_DENSE: @@ -989,15 +964,15 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) { break; case SQT_SUBSEQ: retv = new SequenceScanner(std::unique_ptr(retv), - [](Argument& arg) -> ICpuGpuVectorPtr& { - return arg.subSequenceStartPositions; - }); - // fall through, not break; + [](Argument& arg) -> ICpuGpuVectorPtr& { + return arg.subSequenceStartPositions; + }); + // fall through, not break; case SQT_SEQ: retv = new SequenceScanner(std::unique_ptr(retv), - [](Argument& arg) -> ICpuGpuVectorPtr& { - return arg.sequenceStartPositions; - }); + [](Argument& arg) -> ICpuGpuVectorPtr& { + return arg.sequenceStartPositions; + }); break; default: LOG(FATAL) << "Not implemented"; @@ -1010,19 +985,13 @@ IFieldScanner* IFieldScanner::create(SlotHeader *header) { * No Cache Strategy. Will destruct old data immediately and load data from * python every pass. 
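* (concretely: drop() clears the consumed deque at once and load() returns nullptr, so every pass fetches fresh data from the Python side.)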
*/ -class NoCacheStrategy: public IPyDataProviderCache { +class NoCacheStrategy : public IPyDataProviderCache { public: - virtual bool reset() { - return true; - } + virtual bool reset() { return true; } - virtual void drop(std::deque *data) { - data->clear(); - } + virtual void drop(std::deque* data) { data->clear(); } - virtual std::deque* load() { - return nullptr; - } + virtual std::deque* load() { return nullptr; } }; /** @@ -1033,9 +1002,9 @@ public: */ class CacheOnePassInMemory : public IPyDataProviderCache { public: - CacheOnePassInMemory() : objPool_(new std::deque()), - droppedPool_(new std::deque()) - {} + CacheOnePassInMemory() + : objPool_(new std::deque()), + droppedPool_(new std::deque()) {} virtual bool reset() { if (objPool_->empty() && droppedPool_->empty()) { @@ -1048,25 +1017,22 @@ public: } } - virtual void drop(std::deque *data) { + virtual void drop(std::deque* data) { size_t orgSize = droppedPool_->size(); droppedPool_->resize(orgSize + data->size()); - for (size_t i=0; i < data->size(); ++i) { + for (size_t i = 0; i < data->size(); ++i) { std::swap((*droppedPool_)[orgSize + i], (*data)[i]); } data->clear(); } - virtual std::deque* load() { - return objPool_.get(); - } + virtual std::deque* load() { return objPool_.get(); } private: - std::unique_ptr > objPool_; - std::unique_ptr > droppedPool_; + std::unique_ptr> objPool_; + std::unique_ptr> droppedPool_; }; - IPyDataProviderCache* IPyDataProviderCache::create(CacheType ct) { switch (ct) { case NO_CACHE: diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp index c2625bce9ab0cac7c42a20379c42debea0510c57..8f7d2fb80e9b6f2b4c83d90a04dab5219435d344 100644 --- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp +++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Evaluator.h" #include "paddle/gserver/gradientmachines/NeuralNetwork.h" @@ -33,7 +32,8 @@ private: str.clear(); int prevLabel = -1; for (std::vector::const_iterator label = path.begin(); - label != path.end(); label++) { + label != path.end(); + label++) { if (*label != blank_ && (str.empty() || *label != str.back() || prevLabel == blank_)) { str.push_back(*label); @@ -58,8 +58,11 @@ private: /* "sp, dp, ip" is the weighting parameter of "substitution, deletion, * insertion" * in edit-distance error */ - real stringAlignment(std::vector& gtStr, std::vector& recogStr, - bool backtrace = true, real sp = 1.0, real dp = 1.0, + real stringAlignment(std::vector& gtStr, + std::vector& recogStr, + bool backtrace = true, + real sp = 1.0, + real dp = 1.0, real ip = 1.0) { std::vector> matrix; int substitutions, deletions, insertions; @@ -165,8 +168,8 @@ private: return distance / maxLen; } - real editDistance(real* output, int numTimes, int numClasses, int* labels, - int labelsLen) { + real editDistance( + real* output, int numTimes, int numClasses, int* labels, int labelsLen) { numTimes_ = numTimes; numClasses_ = numClasses; blank_ = numClasses_ - 1; @@ -207,7 +210,8 @@ public: real err = 0; err = editDistance( output.value->getData() + output.value->getWidth() * outputStarts[i], - outputStarts[i+1] - outputStarts[i], output.value->getWidth(), + outputStarts[i + 1] - outputStarts[i], + output.value->getWidth(), label.ids->getData() + labelStarts[i], labelStarts[i + 1] - labelStarts[i]); diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp index 6f5d2b47c3a97d0c95fefd346add2f121ac51764..923e77fc9df919794902daed6113792e7f89a552 100644 --- a/paddle/gserver/evaluators/ChunkEvaluator.cpp +++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp @@ -144,7 +144,8 @@ public: size_t numSequences = sequenceStartPositions->getSize() - 1; const int* starts = sequenceStartPositions->getData(); for (size_t i = 0; i < numSequences; ++i) { - eval1(output->getData() + starts[i], label->getData() + starts[i], + eval1(output->getData() + starts[i], + label->getData() + starts[i], starts[i + 1] - starts[i]); } return 0; diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp index d43dceea7452724c1e45a1b7c5f5f1858d528df7..f5df2b18dedde9022d04b034912e59be00f15413 100644 --- a/paddle/gserver/evaluators/Evaluator.cpp +++ b/paddle/gserver/evaluators/Evaluator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Stat.h" #include "paddle/gserver/evaluators/Evaluator.h" @@ -74,17 +73,19 @@ public: } const MatrixPtr errorMat = Matrix::create(output->getHeight(), - 1, /* trans= */ false, useGpu(arguments[0].deviceId)); + 1, + /* trans= */ false, + useGpu(arguments[0].deviceId)); errorMat->zeroMem(); if (label != nullptr) { errorMat->classificationError(output, label); } else if (dynamic_cast(multiBinaryLabel.get()) || dynamic_cast(multiBinaryLabel.get())) { - errorMat->classificationErrorMulti(*output, *multiBinaryLabel, - config_.classification_threshold()); + errorMat->classificationErrorMulti( + *output, *multiBinaryLabel, config_.classification_threshold()); } else { - errorMat->binaryClassificationError(0, *output, *multiBinaryLabel, - config_.classification_threshold()); + errorMat->binaryClassificationError( + 0, *output, *multiBinaryLabel, config_.classification_threshold()); } if (supportWeight) { @@ -126,8 +127,8 @@ public: int errCounter = 0; CpuVector errorVec(0, nullptr); for (size_t i = 0; i < sequenceStartPositions->getSize() - 1; ++i) { - errorVec.subVecFrom(errorMat->getData(), starts[i], - starts[i + 1] - starts[i]); + errorVec.subVecFrom( + errorMat->getData(), starts[i], starts[i + 1] - starts[i]); if (errorVec.getSum() > 0) { errCounter += 1; } @@ -330,8 +331,8 @@ public: } void distributeEval(ParameterClient2* client) { - client->reduce(sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, - 0); + client->reduce( + sum_->getData(), sum_->getData(), colNum_, FLAGS_trainer_id, 0); client->reduce(&numSamples_, &numSamples_, 1, FLAGS_trainer_id, 0); } @@ -379,8 +380,11 @@ real AucEvaluator::evalImp(std::vector& arguments) { } if (dynamic_cast(output.get())) { - Matrix::resizeOrCreate(cpuOutput_, insNum, outputDim, - /* trans=*/false, /* useGpu=*/false); + Matrix::resizeOrCreate(cpuOutput_, + insNum, + outputDim, + /* trans=*/false, + /* useGpu=*/false); cpuOutput_->copyFrom(*output); IVector::resizeOrCreate(cpuLabel_, insNum, false); cpuLabel_->copyFrom(*label); @@ -479,19 +483,24 @@ real RankAucEvaluator::evalImp(std::vector& arguments) { for (size_t i = 0; i < batchNum; ++i) { int beginPos = startPosData[i]; int endPos = startPosData[i + 1]; - batchAuc += calcRankAuc(outputData + beginPos, clickData + beginPos, - pvData + beginPos, endPos - beginPos); + batchAuc += calcRankAuc(outputData + beginPos, + clickData + beginPos, + pvData + beginPos, + endPos - beginPos); } return batchAuc; } -double RankAucEvaluator::calcRankAuc(real* outputData, real* clickData, - real* pvData, size_t size) { +double RankAucEvaluator::calcRankAuc(real* outputData, + real* clickData, + real* pvData, + size_t size) { outputPair_.clear(); for (size_t i = 0; i < size; ++i) { outputPair_.push_back(std::make_pair(outputData[i], i)); } - std::sort(outputPair_.begin(), outputPair_.end(), + std::sort(outputPair_.begin(), + outputPair_.end(), [](const std::pair& a, const std::pair& b) { return a.first > b.first; }); @@ -790,8 +799,12 @@ real PnpairEvaluator::evalImp(std::vector& arguments) { return 0; } -void PnpairEvaluator::stat(size_t start, size_t end, PredictionResult* answers, - double& pos, double& neg, double& spe) { +void PnpairEvaluator::stat(size_t start, + size_t end, + PredictionResult* answers, + double& pos, + double& neg, + double& spe) { for (size_t i = start; i < end; i++) { for (size_t j = i + 1; j < end; j++) { CHECK_EQ(answers[i].queryid, answers[j].queryid); @@ -817,7 +830,8 @@ void PnpairEvaluator::stat(size_t start, size_t end, PredictionResult* answers, 
} void PnpairEvaluator::calc(std::vector& predictArray) { - std::sort(predictArray.begin(), predictArray.end(), + std::sort(predictArray.begin(), + predictArray.end(), [](const PredictionResult& x, const PredictionResult& y) { return x.queryid < y.queryid; }); @@ -828,11 +842,16 @@ void PnpairEvaluator::calc(std::vector& predictArray) { auto start = predictArray.begin(); while (start != predictArray.end()) { auto end = std::find_if( - start + 1, predictArray.end(), + start + 1, + predictArray.end(), [=](const PredictionResult& x) { return x.queryid != start->queryid; }); CHECK(end != start); - stat(start - predictArray.begin(), end - predictArray.begin(), - predictArray.data(), pos, neg, special); + stat(start - predictArray.begin(), + end - predictArray.begin(), + predictArray.data(), + pos, + neg, + special); start = end; } @@ -1120,8 +1139,8 @@ public: auto resizeMatrix = [](MatrixPtr& dest, const MatrixPtr& src) { if (src && src->useGpu()) { - Matrix::resizeOrCreate(dest, src->getHeight(), src->getWidth(), false, - false); + Matrix::resizeOrCreate( + dest, src->getHeight(), src->getWidth(), false, false); dest->copyFrom(*src); } else { dest = src; diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h index e9957a5ce22c7bdfbec9a783f5e1705df706caba..732abb6079523b1cce8d0727c94ef65581842b4c 100644 --- a/paddle/gserver/evaluators/Evaluator.h +++ b/paddle/gserver/evaluators/Evaluator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/pserver/ParameterClient2.h" @@ -110,7 +109,7 @@ public: return os; } - friend std::ostream&& operator<<(std::ostream&& os, // NOLINT + friend std::ostream&& operator<<(std::ostream&& os, // NOLINT const Evaluator& evaluator) { evaluator.printStats(os); return std::move(os); @@ -184,7 +183,9 @@ private: AucEvaluator() {} - inline static double trapezoidArea(double X1, double X2, double Y1, + inline static double trapezoidArea(double X1, + double X2, + double Y1, double Y2) { return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; } @@ -218,7 +219,9 @@ private: MatrixPtr pv_; std::vector> outputPair_; - double calcRankAuc(real* outputData, real* clickData, real* pvData, + double calcRankAuc(real* outputData, + real* clickData, + real* pvData, size_t size); }; /** @@ -269,10 +272,12 @@ private: IVectorPtr cpuLabel_; MatrixPtr cpuWeight_; - void calcStatsInfo(const MatrixPtr& output, const IVectorPtr& label, + void calcStatsInfo(const MatrixPtr& output, + const IVectorPtr& label, const MatrixPtr& weight); - void calcStatsInfoMulti(const MatrixPtr& output, const MatrixPtr& label, + void calcStatsInfoMulti(const MatrixPtr& output, + const MatrixPtr& label, const MatrixPtr& weight); inline static double calcPrecision(double TP, double FP) { @@ -333,8 +338,12 @@ public: } } - void stat(size_t start, size_t end, PredictionResult* answers, double& pos, - double& neg, double& spe); + void stat(size_t start, + size_t end, + PredictionResult* answers, + double& pos, + double& neg, + double& spe); void calc(std::vector& predictArray); virtual void finish() { calc(predictArray_); } diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp index b20525f66431e16544ce1e05a617286bd5975cfc..3761fda5f370e3b1aef0e394286c49d8ec831694 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.cpp +++ b/paddle/gserver/gradientmachines/GradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "GradientMachine.h" #include "paddle/utils/Logging.h" @@ -29,7 +28,8 @@ limitations under the License. */ namespace paddle { GradientMachine* GradientMachine::create( - const ModelConfig& config, int mode, + const ModelConfig& config, + int mode, const std::vector& parameterTypes) { if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) { return gm; @@ -49,10 +49,11 @@ GradientMachine* GradientMachine::create( /* single thread calculate */ nn = NeuralNetwork::create(config); } - ParamInitCallback testParamInitCb = - [](int paramId, Parameter* para) { para->enableType(PARAMETER_VALUE); }; - nn->init(config, mode == kTesting ? testParamInitCb : nullptr, - parameterTypes); + ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) { + para->enableType(PARAMETER_VALUE); + }; + nn->init( + config, mode == kTesting ? testParamInitCb : nullptr, parameterTypes); return nn; } LOG(FATAL) << "Unknown model type: " << config.type(); diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h index 986a1ee71dbb00781c6af93a06f3e16d6639c307..27cdf7f7890673673d5be63fecdd61d5d2a11447 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ b/paddle/gserver/gradientmachines/GradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -84,10 +83,11 @@ public: * Parameter will have parameterTypes */ static GradientMachine* create( - const ModelConfig& config, int mode = kNormal, + const ModelConfig& config, + int mode = kNormal, const std::vector& parameterTypes = - std::vector{PARAMETER_VALUE, PARAMETER_GRADIENT, - PARAMETER_MOMENTUM}); + std::vector{ + PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}); /** * Create a gradient machine from the merged model file. @@ -137,7 +137,8 @@ public: * @note: if passType==PASS_TEST, then backward() should not be called */ virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType) = 0; + std::vector* outArgs, + PassType passType) = 0; /** * @brief Backward propagation. @@ -211,7 +212,7 @@ public: * @note This function will only been implemented and used in a * multithreaded environment. */ - virtual void start(const TrainerConfig& config, + virtual void start(const TrainerConfig& config, DataProviderPtr dataProvider) { (void)config; (void)dataProvider; @@ -246,7 +247,6 @@ public: */ virtual void restart() {} - /// Set the gradient of the output from outside. virtual void setOutputGrad(const std::vector& args) { LOG(FATAL) << "Not implemented!"; diff --git a/paddle/gserver/gradientmachines/GradientMachineMode.h b/paddle/gserver/gradientmachines/GradientMachineMode.h index 9aff9c616cf514d53b5017dfdb6250a7cbce0198..f2f55a70671858145572e4a5c0f1c4b609145f98 100644 --- a/paddle/gserver/gradientmachines/GradientMachineMode.h +++ b/paddle/gserver/gradientmachines/GradientMachineMode.h @@ -23,10 +23,10 @@ public: virtual ~IGradientMachineMode() {} public: // interfaces - /** - * @brief create current mode's gradient machine by model config. - * @param config model config - */ + /** + * @brief create current mode's gradient machine by model config. + * @param config model config + */ virtual GradientMachine* create(const ModelConfig& config) = 0; /** @@ -37,11 +37,10 @@ public: // interfaces * @param isGpu is using gpu. * @return true if mode should be this mode. */ - virtual bool shouldBeMe( - const std::string& algo, - size_t trainerCount, - bool isLocal, - bool isGpu) const = 0; + virtual bool shouldBeMe(const std::string& algo, + size_t trainerCount, + bool isLocal, + bool isGpu) const = 0; /** * @brief Is data must be in cpu even if using gpu mode. @@ -57,13 +56,13 @@ public: // interfaces virtual bool needTrainWholeDataInOneBatch() const = 0; public: // static methods. - /** - * @brief register a custom gradient machine mode. - * @note For user to register a custom gradient machine mode, id should >= - * kCustom. - * @param mode mode id. - * @param ptr mode description object. - */ + /** + * @brief register a custom gradient machine mode. + * @note For user to register a custom gradient machine mode, id should >= + * kCustom. + * @param mode mode id. + * @param ptr mode description object. + */ static void regGradientMachineMode( int32_t mode, std::unique_ptr&& ptr) { modes_.insert(std::make_pair(mode, std::move(ptr))); @@ -102,9 +101,11 @@ public: // static methods. * @param [in] isGpu using gpu or not. * @return true if there is a custom mode fit these conditions. 
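* (implementation: scan the registered modes in order and return the first whose shouldBeMe() accepts the given algorithm, trainer count, locality and GPU flags.)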
*/ - static bool tryGetMode(int* mode, const std::string& algo, + static bool tryGetMode(int* mode, + const std::string& algo, int32_t trainerCount, - bool isLocal, bool isGpu) { + bool isLocal, + bool isGpu) { for (auto it = modes_.begin(); it != modes_.end(); ++it) { if (it->second->shouldBeMe(algo, trainerCount, isLocal, isGpu)) { *mode = it->first; @@ -130,8 +131,8 @@ public: // static methods. * @brief try to create gradient machine by mode & config. * @return nullptr if we cannot create a gradient machine by such mode. */ - static GradientMachine* tryCreateGradientMachine( - int32_t mode, const ModelConfig& config) { + static GradientMachine* tryCreateGradientMachine(int32_t mode, + const ModelConfig& config) { auto m = IGradientMachineMode::mode(mode); if (m) { return m->create(config); @@ -142,7 +143,7 @@ public: // static methods. private: static std::unordered_map> - modes_; + modes_; }; } // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 0ded30eeb44e95b50ff91722ef96a9f24c81c16d..148451f18dceb0c470dadab01ff91915f994c68f 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MultiGradientMachine.h" #include "paddle/utils/Logging.h" @@ -22,7 +21,8 @@ limitations under the License. */ #include "NeuralNetwork.h" #include "ParallelNeuralNetwork.h" -P_DEFINE_bool(allow_only_one_model_on_one_gpu, true, +P_DEFINE_bool(allow_only_one_model_on_one_gpu, + true, "If true, do not allow multiple models on one GPU device"); #ifdef PADDLE_METRIC_LEARNING P_DECLARE_bool(external); @@ -32,15 +32,15 @@ namespace paddle { // get types of the parameters which need to be merged after backward() static void fillMergeTypes(PassType passType, - std::vector* mergeTypes) { + std::vector* mergeTypes) { mergeTypes->clear(); if (passType != PASS_TEST) { mergeTypes->push_back(PARAMETER_GRADIENT); } } -MultiGradientMachine::MultiGradientMachine( - const ModelConfig& config, bool useGpu) +MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, + bool useGpu) : useGpu_(useGpu), trainerBarrier_(FLAGS_trainer_count), allBarrier_(FLAGS_trainer_count + 1), @@ -65,13 +65,11 @@ MultiGradientMachine::MultiGradientMachine( if (para->useGpu()) return; if (para->isSparseRemoteUpdate()) { - para->enableType( - PARAMETER_VALUE, - FLAGS_loadsave_parameters_in_pserver - ? Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); - para->enableType( - PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); + para->enableType(PARAMETER_VALUE, + FLAGS_loadsave_parameters_in_pserver + ? 
Parameter::MAT_SPARSE_ROW_PREFETCH + : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); + para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW); } else if (para->isGradSparseUpdate()) { para->enableType(PARAMETER_VALUE); para->enableType(PARAMETER_GRADIENT, Parameter::MAT_SPARSE_ROW_IDS); @@ -100,17 +98,16 @@ MultiGradientMachine::MultiGradientMachine( if (useGpu_) { numLogicalDevices_ = 1; - for (size_t pid = 0; pid < parameters_.size(); pid++) { + for (size_t pid = 0; pid < parameters_.size(); pid++) { if (parameters_[pid]->getConfig().device() + 1 > numLogicalDevices_) { numLogicalDevices_ = parameters_[pid]->getConfig().device() + 1; } } LOG(INFO) << "numLogicalDevices=" << numLogicalDevices_ - << " numThreads=" << numThreads_ - << " numDevices=" << numDevices_; + << " numThreads=" << numThreads_ << " numDevices=" << numDevices_; - if (numLogicalDevices_ * numThreads_ > numDevices_ - && FLAGS_allow_only_one_model_on_one_gpu) { + if (numLogicalDevices_ * numThreads_ > numDevices_ && + FLAGS_allow_only_one_model_on_one_gpu) { LOG(FATAL) << "trainer_count * num_devices_in_model " << "(" << numThreads_ << "*" << numLogicalDevices_ << ")" << "=" << numThreads_ * numLogicalDevices_ @@ -130,11 +127,7 @@ MultiGradientMachine::MultiGradientMachine( } for (int i = 0; i < numThreads_; ++i) { - threads_.emplace_back( - new TrainerThread( - config, - i, - this)); + threads_.emplace_back(new TrainerThread(config, i, this)); } bufferSizes_.resize(numLogicalDevices_, 0); @@ -162,7 +155,7 @@ MultiGradientMachine::MultiGradientMachine( // combination of all trainers mainPara into GradientMachine parameters hasNonstaticCpuParamters_ = false; - for (size_t pid = 0; pid < parameters_.size(); pid++) { + for (size_t pid = 0; pid < parameters_.size(); pid++) { if (parameters_[pid]->useGpu()) { parameters_[pid] = threads_[paraMainThread_[pid]]->getParameters()[pid]; } else if (!parameters_[pid]->isStatic()) { @@ -209,7 +202,7 @@ void MultiGradientMachine::allocGradBufs() { SetDevice device(logicalDeviceId2RealDeviceId(d, i)); for (size_t j = 0; j < mergeTypes_.size(); j++) { gradBufs_[i][d].bufs.push_back( - Vector::create(bufferSizes_[d], /* useGpu= */true)); + Vector::create(bufferSizes_[d], /* useGpu= */ true)); } } } @@ -249,18 +242,16 @@ void MultiGradientMachine::prefetch(const std::vector& inArgs) { } } -void MultiGradientMachine::forward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType) { +void MultiGradientMachine::forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType) { forwardImp(inArgs, outArgs, passType, TASK_FORWARD); } -void MultiGradientMachine::forwardImp( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - TaskType taskType) { +void MultiGradientMachine::forwardImp(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + TaskType taskType) { updateThreadParameters(); passType_ = passType; @@ -282,18 +273,16 @@ void MultiGradientMachine::backward(const UpdateCallback& callback) { backwardImp(callback); } -void MultiGradientMachine::forwardBackward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback) { +void MultiGradientMachine::forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { backwardCallback_ = callback; forwardImp(inArgs, outArgs, passType, TASK_FORWARD_BACKWARD); backwardImp(callback); } -void MultiGradientMachine::backwardImp( - const UpdateCallback& callback) { 
+void MultiGradientMachine::backwardImp(const UpdateCallback& callback) { for (size_t i = 0; i < parameters_.size(); i++) { if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue; REGISTER_TIMER("controller_dequeue"); @@ -349,9 +338,8 @@ void MultiGradientMachine::eval(Evaluator* evaluator) { } } -void MultiGradientMachine::getOutArgs( - std::vector* outArgs, - PassType passType) { +void MultiGradientMachine::getOutArgs(std::vector* outArgs, + PassType passType) { for (auto& thread : threads_) { REGISTER_TIMER("waitOutArgs"); thread->waitOutArgsReady(); @@ -375,7 +363,6 @@ void MultiGradientMachine::getOutArgs( *outArgs = outArgs_; } - void MultiGradientMachine::setOutputGrad(const std::vector& args) { CHECK_EQ(args.size(), outArgs_.size()); for (size_t i = 0; i < args.size(); i++) { @@ -390,10 +377,9 @@ void MultiGradientMachine::startTask(TaskType taskType) { } } -TrainerThread::TrainerThread( - const ModelConfig& config, - int threadId, - MultiGradientMachine* multiMachine) +TrainerThread::TrainerThread(const ModelConfig& config, + int threadId, + MultiGradientMachine* multiMachine) : multiMachine_(multiMachine), config_(config), threadId_(threadId), @@ -407,8 +393,9 @@ TrainerThread::TrainerThread( partnerId_ = mod(threadId_ - 1, numThreads); - deviceId_ = !multiMachine_->useGpu() ? -1 - : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_); + deviceId_ = !multiMachine_->useGpu() + ? -1 + : multiMachine_->logicalDeviceId2RealDeviceId(0, threadId_); SetDevice gpuDevice(deviceId_); NeuralNetwork* nn = nullptr; @@ -418,22 +405,20 @@ TrainerThread::TrainerThread( nn = new ParallelNeuralNetwork(); for (auto& paraConfig : *config_.mutable_parameters()) { if (paraConfig.device() != -1) { - paraConfig.set_device( - multiMachine_->logicalDeviceId2RealDeviceId( + paraConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( paraConfig.device(), threadId_)); } } for (auto& layerConfig : *config_.mutable_layers()) { if (layerConfig.device() != -1) { - layerConfig.set_device( - multiMachine_->logicalDeviceId2RealDeviceId( + layerConfig.set_device(multiMachine_->logicalDeviceId2RealDeviceId( layerConfig.device(), threadId_)); } } } // Only GPU do not share parameter values with main paramters. 
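
One detail worth a second look in the TrainerThread constructor above is the ring topology: `partnerId_ = mod(threadId_ - 1, numThreads)` makes each thread's partner its predecessor on a ring. A minimal stand-alone check of that arithmetic, with our own non-negative `mod()` standing in for Paddle's helper:

```cpp
#include <cstdio>

// Non-negative modulo, standing in for Paddle's mod() helper.
static int mod(int a, int m) { return ((a % m) + m) % m; }

int main() {
  const int numThreads = 4;
  for (int i = 0; i < numThreads; ++i)
    std::printf("thread %d sends to partner %d\n", i, mod(i - 1, numThreads));
  // thread 0 -> 3, thread 1 -> 0, thread 2 -> 1, thread 3 -> 2
}
```

Note that plain `(threadId_ - 1) % numThreads` would yield -1 for thread 0 in C++, which is why the non-negative modulo matters here.
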
- ParamInitCallback slaveParamInitCb = std::bind(parameterInitNN, _1, _2, - &mainParas); + ParamInitCallback slaveParamInitCb = + std::bind(parameterInitNN, _1, _2, &mainParas); nn->init(config_, slaveParamInitCb); gradientMachine_.reset(nn); parameters_ = gradientMachine_->getParameters(); @@ -443,9 +428,8 @@ TrainerThread::TrainerThread( } } - backwardCallback_ = std::bind( - &TrainerThread::backwardCallback, - this, std::placeholders::_1); + backwardCallback_ = + std::bind(&TrainerThread::backwardCallback, this, std::placeholders::_1); gradStream_ = HPPL_STREAM_2; valueStream_ = HPPL_STREAM_3; @@ -454,25 +438,21 @@ TrainerThread::TrainerThread( parameterUpdated_ = false; } -TrainerThread::~TrainerThread() { - stop(); -} +TrainerThread::~TrainerThread() { stop(); } void TrainerThread::start() { - gradientMachine_->start(*(TrainerConfig*)nullptr, (DataProviderPtr)nullptr); + gradientMachine_->start(*(TrainerConfig*)nullptr, (DataProviderPtr) nullptr); - computeThread_.reset(new std::thread( - [this](){ computeThread(); })); + computeThread_.reset(new std::thread([this]() { computeThread(); })); if (multiMachine_->useGpu()) { - gradCollectThread_.reset(new std::thread( - [this](){ gradCollectThread(); })); + gradCollectThread_.reset( + new std::thread([this]() { gradCollectThread(); })); - valueDispatchThread_.reset(new std::thread( - [this](){ valueDispatchThread(); })); + valueDispatchThread_.reset( + new std::thread([this]() { valueDispatchThread(); })); - copyThread_.reset(new std::thread( - [this](){ copyGradToBufferThread(); })); + copyThread_.reset(new std::thread([this]() { copyGradToBufferThread(); })); } } @@ -565,20 +545,14 @@ void TrainerThread::forward() { { REGISTER_TIMER("wait_value"); - valueReadyCond_.wait( - [this]() { - return !parameterUpdated_; - }); + valueReadyCond_.wait([this]() { return !parameterUpdated_; }); } - { - fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); - } + { fillMergeTypes(multiMachine_->getPassType(), &mergeTypes_); } { REGISTER_TIMER("thread_forward"); - gradientMachine_->forward( - inArgs_, &outArgs_, multiMachine_->getPassType()); + gradientMachine_->forward(inArgs_, &outArgs_, multiMachine_->getPassType()); } outArgsReadySem_.post(); } @@ -602,9 +576,8 @@ void TrainerThread::backwardCallback(Parameter* para) { if (multiMachine_->getNumThreads() == 1) { // no need to do merge if there is only one thread doCallback(paramId); - } else if (threadId_ == - mod(multiMachine_->paraMainThread(paramId) - 1, - multiMachine_->getNumThreads())) { + } else if (threadId_ == mod(multiMachine_->paraMainThread(paramId) - 1, + multiMachine_->getNumThreads())) { notifyCopyGradToBuffer(paramId); } else { notifyGradientCollect(paramId); @@ -625,7 +598,7 @@ void TrainerThread::copyGradToBufferThread() { if (stopping_) break; int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId( - parameters_[pid]->getDeviceId(), threadId_); + parameters_[pid]->getDeviceId(), threadId_); auto& gradBuf = gradBufs[pdeviceId]; @@ -639,9 +612,9 @@ void TrainerThread::copyGradToBufferThread() { SetDevice setDevice(parameters_[pid]->getDeviceId()); for (size_t i = 0; i < mergeTypes_.size(); ++i) { gradBuf.bufs[i]->resize( - parameters_[pid]->getBuf(mergeTypes_[i])->getSize()); - gradBuf.bufs[i]->copyFrom( - *parameters_[pid]->getBuf(mergeTypes_[i]), gradStream_); + parameters_[pid]->getBuf(mergeTypes_[i])->getSize()); + gradBuf.bufs[i]->copyFrom(*parameters_[pid]->getBuf(mergeTypes_[i]), + gradStream_); } hl_stream_synchronize(gradStream_); } @@ -667,7 +640,7 @@ void 
TrainerThread::gradCollectThread() { if (++gradReadyCount[pid] < 2) continue; gradReadyCount[pid] = 0; int pdeviceId = multiMachine_->realDeviceId2LogicalDeviceId( - parameters_[pid]->getDeviceId(), threadId_); + parameters_[pid]->getDeviceId(), threadId_); auto& gradBuf = gradBufs[pdeviceId]; @@ -741,8 +714,7 @@ void TrainerThread::valueDispatchThread() { void TrainerThread::notifyValueReady(int paramId) { if (--updateCounter_ == 0) { - valueReadyCond_.notify_all( - [this] { parameterUpdated_ = false; }); + valueReadyCond_.notify_all([this] { parameterUpdated_ = false; }); } notifyValueDispatch(paramId); @@ -750,7 +722,7 @@ void TrainerThread::notifyValueReady(int paramId) { void TrainerThread::copyInArgs() { const std::vector& fullInArgs = multiMachine_->getInArgs(); - int numThreads = multiMachine_->getAllThreads().size(); + int numThreads = multiMachine_->getAllThreads().size(); int32_t numSequences = fullInArgs[0].getNumSequences(); int32_t startSeq = numSequences * threadId_ / numThreads; int32_t endSeq = numSequences * (threadId_ + 1) / numThreads; @@ -767,9 +739,11 @@ void TrainerThread::copyInArgs() { return; } - for (size_t i=0; i < fullInArgs.size(); i++) { + for (size_t i = 0; i < fullInArgs.size(); i++) { inArgs_[i].resizeAndCopyFrom( - fullInArgs[i], startSeq, copySize, + fullInArgs[i], + startSeq, + copySize, FLAGS_parallel_nn ? false : multiMachine_->useGpu()); } } @@ -814,10 +788,8 @@ void TrainerThread::mergeGradSparse( std::vector& ids = mainMat->getIds(threadId_); for (auto slaveParams : slaveParameters) { - SparseRowCpuMatrix* mat = - dynamic_cast((*slaveParams)[pid] - ->getMat(PARAMETER_GRADIENT) - .get()); + SparseRowCpuMatrix* mat = dynamic_cast( + (*slaveParams)[pid]->getMat(PARAMETER_GRADIENT).get()); mat->addTo(*mainMat, ids, threadId_, multiMachine_->getNumThreads()); // we use a sample hash method(%) instead of range partition, // because range partition has balance issue sometimes, @@ -847,9 +819,10 @@ void TrainerThread::mergeGradDense( Parameter* para, std::vector*>& slaveParameters) { size_t pid = para->getID(); - auto interval = - calcSplitArrayInterval(para->getSize(), (size_t)threadId_, - multiMachine_->getNumThreads(), 8LU /*for avx*/); + auto interval = calcSplitArrayInterval(para->getSize(), + (size_t)threadId_, + multiMachine_->getNumThreads(), + 8LU /*for avx*/); size_t startSeq = interval.first; size_t copySize = interval.second - interval.first; @@ -861,8 +834,7 @@ void TrainerThread::mergeGradDense( CpuVector slaveGradSub(0, nullptr); for (auto slaveParams : slaveParameters) { slaveGradSub.subVecFrom( - *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), - startSeq, copySize); + *(*slaveParams)[pid]->getBuf(PARAMETER_GRADIENT), startSeq, copySize); destGrad.add(slaveGradSub); } } @@ -876,7 +848,9 @@ void TrainerThread::copyOutputGrad() { int32_t copySize = endSeq - startSeq; outArgs_.resize(outputGradArgs.size()); for (size_t i = 0; i < outputGradArgs.size(); i++) { - outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], startSeq, copySize, + outArgs_[i].resizeAndCopyFrom(outputGradArgs[i], + startSeq, + copySize, multiMachine_->useGpu(), HPPL_STREAM_DEFAULT); } diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h index d13cf426c29e4e9f6806178f2362e8189fdb0dec..58c5486810cf280c48c62f2256480c1a4bb047bc 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.h +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF 
ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -52,7 +51,8 @@ struct GradBuffer { * * It handles GPU and Cpu parameters differently. In GPU, one computing thread * generally corresponds to one GPU device. Thus, each thread keeps a separate - * copy of the parameter in its own device's memory. In CPU, we only need to keep + * copy of the parameter in its own device's memory. In CPU, we only need to + keep * one copy of the parameters in the main memory. After, each computing thread * computes its own parameter gradient, the update process needs to accumulate * the parameter gradients from all the computing threads, and update the @@ -66,16 +66,21 @@ struct GradBuffer { * computing thread so that the parameters in all the computing threads are * synchronized. The scatter and gather process are implemented by ring-style * communication. Assume we have N computing threads, its thread ids will be - * 0, 1, ..., N-1. For each parameter, the id of the main thread is specified in - * paraMainThread_[pid], where pid is the id of the parameter. Each thread i only + * 0, 1, ..., N-1. For each parameter, the id of the main thread is specified + in + * paraMainThread_[pid], where pid is the id of the parameter. Each thread i + only * sends data to its partner thread (i - 1) % N. For example, for a parameter * gradient that is computed in thread 4, and its main thread is 2. Its - * traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the gradient + * traveling process would be 4, 5,..., N-1, 0, 1, 2. In each step, the + gradient * buffer is added to the local gradient, and the local gradient is then copied * to the gradient buffer of the next thread. At last, its main thread 2 will * get the accumulated parameter gradient. For the same parameter, after its - * value is updated, the value's traveling process would be 2, 1, 0, N-1, ... 3. - * At the end, all the computing threads would have the updated parameter value. + * value is updated, the value's traveling process would be 2, 1, 0, N-1, ... + 3. + * At the end, all the computing threads would have the updated parameter + value. * * A computing thread (TrainerThread) uses 4 threads to do different jobs: * @@ -94,8 +99,10 @@ struct GradBuffer { * * Handling of sparse update * Currently, sparse update is only supported for CPU parameters. - * Sparse updates refers to gradient caculation where the gradient is sparse. For - * example, if the input argument to a 'fc' layer is sparse, the gradient of the + * Sparse updates refers to gradient caculation where the gradient is sparse. + For + * example, if the input argument to a 'fc' layer is sparse, the gradient of + the * weight matrix of this layer will be sparse. It is usually more efficient to * treat the gradient explicitly as sparse vector during the parameter update. @@ -104,7 +111,8 @@ struct GradBuffer { * For both types of sparse updates, there is one copy of parameter value and * gradient called main parameter value and gradient, and there is a copy of - * parameter value and gradient for each computing thread called slave parameter + * parameter value and gradient for each computing thread called slave + parameter * value and gradient. The slave parameter values are always shared with the * corresponding main parameter value. The slave parameter grad is a sparse row * matrix. 
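
The ring-style scatter/gather described in this comment can be simulated in a few lines. The toy below fixes one direction of travel and uses scalars in place of gradient vectors; the real code pipelines one such trip per parameter and dispatches updated values around the same ring in the opposite direction, so this is an illustration of the accumulation invariant only.

```cpp
#include <cstdio>

int main() {
  const int N = 6;           // computing threads 0 .. N-1
  const int mainThread = 2;  // paraMainThread_[pid] for this parameter
  float localGrad[N];
  for (int i = 0; i < N; ++i) localGrad[i] = 1.0f;  // each thread's gradient

  float buf = 0.0f;
  int cur = (mainThread + 1) % N;  // a full trip that ends at mainThread
  for (int step = 0; step < N; ++step) {
    buf += localGrad[cur];  // add the local gradient into the buffer
    cur = (cur + 1) % N;    // then copy the buffer to the next thread
  }
  std::printf("thread %d now holds the full sum: %g\n", mainThread, buf);
}
```

After N hops the buffer has visited every thread exactly once and the main thread holds the accumulated gradient, matching the traveling process described above.
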
The sparse pattern for slave parameter grads are different, because @@ -124,7 +132,8 @@ struct GradBuffer { * (SparseAutoGrowRowCpuMatrix). It is a sparse row matrix. * * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will - * gather all the non-zero gradient. And After backward(), they will be merged + * gather all the non-zero gradient. And After backward(), they will be + merged * into main parameter grad (SparseRowIdsCpuMatrix), with indices indicating * which rows have nonzero gradient. * @@ -136,9 +145,11 @@ struct GradBuffer { * parameter values that are prefetched is up-to-date. * * Main parameter grad type is MAT_SPARSE_ROW (SparseRowCpuMatrix). - * And it shares sparse pattern with value by sharing indexDictHandle_, which + * And it shares sparse pattern with value by sharing indexDictHandle_, + which * is an internal data structure used by SparseRowCpuMatrixto specify the - * sparsity pattern of Slave parameter value shares with main parameter value. + * sparsity pattern of Slave parameter value shares with main parameter + value. * * Slave parameter grad type is MAT_SPARSE_ROW_AUTO_GROW * (SparsePrefetchRowCpuMatrix). It is a sparse row matrix @@ -148,8 +159,10 @@ struct GradBuffer { * parameter server. * * During backward() of each TrainerThread, SparseAutoGrowRowCpuMatrix will - * gather all the non-zero gradient. And After backward(), they will be merged - * into main parameter grad (SparseRowCpuMatrix). And the framework will send + * gather all the non-zero gradient. And After backward(), they will be + merged + * into main parameter grad (SparseRowCpuMatrix). And the framework will + send * the merged gradient to parameter server. */ class MultiGradientMachine : public GradientMachine { @@ -165,18 +178,16 @@ public: virtual void prefetch(const std::vector& inArgs); - virtual void forward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType); + virtual void forward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); - void forwardBackward( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - const UpdateCallback& callback); + void forwardBackward(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback); virtual void onPassEnd(); @@ -186,9 +197,7 @@ public: virtual void eval(Evaluator* evaluator); - bool useGpu() const { - return useGpu_; - } + bool useGpu() const { return useGpu_; } /// @return whether to pass the gradients in outArgs_ to each threads. bool isPassGrad() { return isPassGrad_; } @@ -203,9 +212,7 @@ public: protected: friend class TrainerThread; - std::vector& getAllThreads() { - return threads_; - } + std::vector& getAllThreads() { return threads_; } /// Calculate the real device id based on the logical device id and the /// thread id. int logicalDeviceId2RealDeviceId(int logicalId, int threadId = 0) const { @@ -229,9 +236,7 @@ protected: std::vector*> getSlaveParameters(); - bool hasNonstaticCpuParamters() const { - return hasNonstaticCpuParamters_; - } + bool hasNonstaticCpuParamters() const { return hasNonstaticCpuParamters_; } /// Called TrainerThread to wait before merging CPU parameter gradients. 
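
For dense gradients, the merge described here is implemented by `TrainerThread::mergeGradDense()` (see the .cpp hunks earlier), which splits each parameter across threads with `calcSplitArrayInterval(size, tid, nThreads, 8LU /*for avx*/)`. The helper below is our own guess at what such an aligned split computes, not Paddle's actual implementation:

```cpp
#include <algorithm>
#include <cstddef>
#include <utility>

// Divide [0, size) into nThreads chunks whose starts are multiples of
// `align` reals, so each thread's slice is AVX-friendly.
std::pair<std::size_t, std::size_t> splitInterval(std::size_t size,
                                                  std::size_t tid,
                                                  std::size_t nThreads,
                                                  std::size_t align) {
  std::size_t chunk = (size + nThreads - 1) / nThreads;
  chunk = (chunk + align - 1) / align * align;  // round up to the alignment
  std::size_t start = std::min(tid * chunk, size);
  std::size_t end = std::min(start + chunk, size);
  return {start, end};  // thread `tid` merges gradient elements [start, end)
}
```

For example, size = 100, nThreads = 4, align = 8 yields [0,32), [32,64), [64,96), [96,100): every boundary except the last is a multiple of 8, and the slices cover the array without overlap.
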
void waitBeforeMerge() { trainerBarrier_.wait(); } @@ -244,59 +249,41 @@ protected: /// finishing void waitForCopyInArgs() { allBarrier_.wait(); } - TrainerThreadPtr& getThread(int threadId) { - return threads_[threadId]; - } + TrainerThreadPtr& getThread(int threadId) { return threads_[threadId]; } std::vector& getGradBuf(int threadId) { return gradBufs_[threadId]; } - PassType getPassType() const { - return passType_; - } + PassType getPassType() const { return passType_; } /// Called by TrainerThread to notify MultiGradientMachine that the gradient /// for paramId is ready void notifyGradientTransfer(int paramId); - const std::vector& getInArgs() { - return inArgs_; - } + const std::vector& getInArgs() { return inArgs_; } - TaskType getTaskType() const { - return taskType_; - } + TaskType getTaskType() const { return taskType_; } const UpdateCallback& getBackwardCallback() const { return backwardCallback_; } - int getNumDevices() const { - return numDevices_; - } + int getNumDevices() const { return numDevices_; } - int getNumLogicalDevices() const { - return numLogicalDevices_; - } + int getNumLogicalDevices() const { return numLogicalDevices_; } - int getNumThreads() const { - return numThreads_; - } + int getNumThreads() const { return numThreads_; } - int paraMainThread(int pid) const { - return paraMainThread_[pid]; - } + int paraMainThread(int pid) const { return paraMainThread_[pid]; } protected: - virtual void forwardImp( - const std::vector& inArgs, - std::vector* outArgs, - PassType passType, - TaskType taskType); + virtual void forwardImp(const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + TaskType taskType); - virtual void backwardImp( - const UpdateCallback& callback = NULL); + virtual void backwardImp(const UpdateCallback& callback = NULL); /// update all parameters void updateThreadParameters(); @@ -329,9 +316,9 @@ protected: /// ParameterType which needs to be merged from each GPU std::vector mergeTypes_; - int numDevices_; /* number of gpu devices */ + int numDevices_; /* number of gpu devices */ int numLogicalDevices_; // number of GPU used by one NN - int numThreads_; /* number of train threads */ + int numThreads_; /* number of train threads */ UpdateCallback backwardCallback_; @@ -350,38 +337,25 @@ protected: class TrainerThread { public: - TrainerThread( - const ModelConfig& config, - int threadId, - MultiGradientMachine* multiMachine); + TrainerThread(const ModelConfig& config, + int threadId, + MultiGradientMachine* multiMachine); ~TrainerThread(); void start(); - void onPassEnd() { - gradientMachine_->onPassEnd(); - } + void onPassEnd() { gradientMachine_->onPassEnd(); } - void waitOutArgsReady() { - outArgsReadySem_.wait(); - } + void waitOutArgsReady() { outArgsReadySem_.wait(); } - void notifyTaskReady() { - taskReadySem_.post(); - } + void notifyTaskReady() { taskReadySem_.post(); } - int getDeviceId() const { - return deviceId_; - } + int getDeviceId() const { return deviceId_; } - GradientMachine* getGradientMachine() { - return gradientMachine_.get(); - } + GradientMachine* getGradientMachine() { return gradientMachine_.get(); } - const std::vector& getParameters() { - return parameters_; - } + const std::vector& getParameters() { return parameters_; } void stop(); @@ -391,26 +365,18 @@ public: return parameters_[paramId]->getBuf(PARAMETER_VALUE); } - const std::vector& getOutArgs() { - return outArgs_; - } + const std::vector& getOutArgs() { return outArgs_; } void incUpdateCounter(int n = 1) { updateCounter_ += n; parameterUpdated_ 
= true; } - void notifyGradientCollect(int paramId) { - gradQueue_.enqueue(paramId); - } + void notifyGradientCollect(int paramId) { gradQueue_.enqueue(paramId); } - void notifyCopyGradToBuffer(int paramId) { - gradBufQueue_.enqueue(paramId); - } + void notifyCopyGradToBuffer(int paramId) { gradBufQueue_.enqueue(paramId); } - void notifyValueDispatch(int paramId) { - valueReadyQueue_.enqueue(paramId); - } + void notifyValueDispatch(int paramId) { valueReadyQueue_.enqueue(paramId); } void prefetch(); @@ -421,16 +387,16 @@ protected: void mergeCpuGradients(); void mergeGradSparse( - Parameter* para, - std::vector*>& slaveParameters); + Parameter* para, + std::vector*>& slaveParameters); void mergeGradSparseRemote( - Parameter* para, - std::vector*>& slaveParameters); + Parameter* para, + std::vector*>& slaveParameters); void mergeGradDense( - Parameter* para, - std::vector*>& slaveParameters); + Parameter* para, + std::vector*>& slaveParameters); void computeThread(); void valueDispatchThread(); @@ -499,5 +465,4 @@ protected: bool inArgsCopied_; }; - } // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp index b85d2e0c99aba3051f951779190468d043e2c447..e5be19cad6b450850de4cc5776017b79d3243681 100644 --- a/paddle/gserver/gradientmachines/MultiNetwork.cpp +++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" #include @@ -24,7 +23,8 @@ limitations under the License. */ namespace paddle { -void MultiNetwork::init(const ModelConfig& config, ParamInitCallback callback, +void MultiNetwork::init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu) { CHECK_GT(config.sub_models_size(), 1) << "sub_models_size should GT 1"; @@ -40,10 +40,10 @@ void MultiNetwork::init(const ModelConfig& config, ParamInitCallback callback, std::string subModelName = config.sub_models(i).name(); if (FLAGS_parallel_nn) { subNetworks_[i - 1] = std::unique_ptr( - new ParallelNeuralNetwork(subModelName, this)); + new ParallelNeuralNetwork(subModelName, this)); } else { subNetworks_[i - 1] = std::unique_ptr( - NeuralNetwork::newNeuralNetwork(subModelName, this)); + NeuralNetwork::newNeuralNetwork(subModelName, this)); } subNetworks_[i - 1]->init(config); } @@ -64,7 +64,8 @@ void MultiNetwork::prefetch(const std::vector& inArgs) { } void MultiNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType) { + std::vector* outArgs, + PassType passType) { // split inArgs to several vectors std::vector> argumentGroups; Argument::splitByDataId(inArgs, &argumentGroups); diff --git a/paddle/gserver/gradientmachines/MultiNetwork.h b/paddle/gserver/gradientmachines/MultiNetwork.h index a162420c3bfe63fdca26dc5da0514dc7854df091..779a2267f55c8e1b5d120d9fd1e2a0d455cc5c59 100644 --- a/paddle/gserver/gradientmachines/MultiNetwork.h +++ b/paddle/gserver/gradientmachines/MultiNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "GradientMachine.h" @@ -27,19 +26,22 @@ public: explicit MultiNetwork(std::string subModelName = "") : NeuralNetwork(subModelName) {} - virtual void init(const ModelConfig& config, ParamInitCallback callback, + virtual void init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu); virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType); + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, PassType passType, + std::vector* outArgs, + PassType passType, const UpdateCallback& callback); virtual void onPassEnd(); @@ -52,8 +54,7 @@ public: return subNetworks_; } - virtual void start(const TrainerConfig& config, - DataProviderPtr dataProvider); + virtual void start(const TrainerConfig& config, DataProviderPtr dataProvider); virtual void finish(); diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index c77b00eb06122becf232f786a348ae73033648d6..9932ea655ebdceb2eb1ae8920f4d320163d14262 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "paddle/utils/Logging.h" @@ -26,7 +25,8 @@ limitations under the License. */ #include "paddle/gserver/layers/AgentLayer.h" namespace paddle { -void parameterInitNN(int paramId, Parameter* para, +void parameterInitNN(int paramId, + Parameter* para, std::vector* sharedParams) { // Create parameters values. if (!para->useGpu() && sharedParams) { @@ -35,10 +35,10 @@ void parameterInitNN(int paramId, Parameter* para, (*sharedParams)[paramId]->getMat(PARAMETER_VALUE)); } else { if (para->isSparseRemoteUpdate()) { - para->enableType( - PARAMETER_VALUE, FLAGS_loadsave_parameters_in_pserver - ? Parameter::MAT_SPARSE_ROW_PREFETCH - : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); + para->enableType(PARAMETER_VALUE, + FLAGS_loadsave_parameters_in_pserver + ? Parameter::MAT_SPARSE_ROW_PREFETCH + : Parameter::MAT_SPARSE_ROW_PREFETCH_FULL_SIZE); } else { para->enableType(PARAMETER_VALUE); } @@ -65,7 +65,8 @@ NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) { std::map NeuralNetwork::dllInitMap; -void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, +void NeuralNetwork::init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu) { using std::placeholders::_1; @@ -89,12 +90,13 @@ void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, } else { parameters_.reserve(config.parameters_size()); for (const auto& para_config : config.parameters()) { - auto parameter = std::make_shared(para_config, useGpu, + auto parameter = std::make_shared(para_config, + useGpu, /*initialize=*/false); paramCallback(parameters_.size(), parameter.get()); if (!callback) { for (ParameterType type : - (parameter->isStatic() + (parameter->isStatic() ? 
std::vector{PARAMETER_VALUE} : parameterTypes)) { if (type != PARAMETER_VALUE && type != PARAMETER_GRADIENT) { @@ -117,18 +119,19 @@ void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, layerMap_[layer->getName()] = layer; }; - auto subModelConfig = - std::find_if(config.sub_models().begin(), config.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); + auto subModelConfig = std::find_if(config.sub_models().begin(), + config.sub_models().end(), + [=](const SubModelConfig& sub_model) { + return sub_model.name() == subModelName_; + }); bool useSubModel = (subModelConfig != config.sub_models().end()); CHECK_EQ(useSubModel, !subModelName_.empty()); if (useSubModel) { layers_.reserve(subModelConfig->layer_names_size()); for (const auto& layer_name : subModelConfig->layer_names()) { auto layer_config = - std::find_if(config.layers().begin(), config.layers().end(), + std::find_if(config.layers().begin(), + config.layers().end(), [=](const LayerConfig& layer_config) { return layer_config.name() == layer_name; }); @@ -176,14 +179,16 @@ void NeuralNetwork::init(const ModelConfig& config, ParamInitCallback callback, } } -void NeuralNetwork::connect(LayerPtr agentLayer, LayerPtr realLayer, +void NeuralNetwork::connect(LayerPtr agentLayer, + LayerPtr realLayer, int height) { AgentLayer* agent = dynamic_cast(agentLayer.get()); CHECK_NOTNULL(agent); agent->setRealLayer(realLayer, height); } -void NeuralNetwork::connect(std::string agentLayerName, NeuralNetwork* srcNN, +void NeuralNetwork::connect(std::string agentLayerName, + NeuralNetwork* srcNN, std::string realLayerName) { connect(this->getLayer(agentLayerName), srcNN->getLayer(realLayerName)); } @@ -195,7 +200,7 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { for (auto& para : parameters_) { if (para->isSparseRemoteUpdate()) { auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); + para->getMat(PARAMETER_VALUE).get()); para->clearGradient(); mat->clearIndices(); } @@ -217,10 +222,10 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { for (auto& para : parameters_) { if (para->isSparseRemoteUpdate()) { auto mat = dynamic_cast( - para->getMat(PARAMETER_VALUE).get()); + para->getMat(PARAMETER_VALUE).get()); mat->setupIndices(); auto matGrad = dynamic_cast( - para->getMat(PARAMETER_GRADIENT).get()); + para->getMat(PARAMETER_GRADIENT).get()); matGrad->reserveStore(); } } @@ -228,7 +233,8 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { } void NeuralNetwork::forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType) { + std::vector* outArgs, + PassType passType) { CHECK_EQ(inArgs.size(), dataLayers_.size()); outArgs->resize(outputLayers_.size()); for (size_t i = 0; i != dataLayers_.size(); ++i) { @@ -344,11 +350,11 @@ protected: Evaluator* NeuralNetwork::makeEvaluator() { CombinedEvaluator* combinedEvaluator = new CombinedEvaluator(); - auto subModelConfig = - std::find_if(config_.sub_models().begin(), config_.sub_models().end(), - [=](const SubModelConfig& sub_model) { - return sub_model.name() == subModelName_; - }); + auto subModelConfig = std::find_if(config_.sub_models().begin(), + config_.sub_models().end(), + [=](const SubModelConfig& sub_model) { + return sub_model.name() == subModelName_; + }); bool useSubModel = (subModelConfig != config_.sub_models().end()); CHECK_EQ(useSubModel, !subModelName_.empty()); if (useSubModel) { @@ -356,7 +362,8 @@ Evaluator* NeuralNetwork::makeEvaluator() { for (int i = 
0; i < subModelConfig->evaluator_names_size(); ++i) { // find evaluator by name auto thisEvalConfig = std::find_if( - config_.evaluators().begin(), config_.evaluators().end(), + config_.evaluators().begin(), + config_.evaluators().end(), [=](const EvaluatorConfig& ecfg) { return ecfg.name() == subModelConfig->evaluator_names(i); }); @@ -385,17 +392,17 @@ void NeuralNetwork::setOutputGrad(const std::vector& args) { } } -extern NeuralNetwork* newCustomNerualNetwork( - const std::string& name, NeuralNetwork* network) __attribute__((weak)); +extern NeuralNetwork* newCustomNerualNetwork(const std::string& name, + NeuralNetwork* network) + __attribute__((weak)); -NeuralNetwork* NeuralNetwork::newNeuralNetwork( - const std::string& name, - NeuralNetwork* rootNetwork) { - if (newCustomNerualNetwork) { - return newCustomNerualNetwork(name, rootNetwork); - } else { - return new NeuralNetwork(name, rootNetwork); - } +NeuralNetwork* NeuralNetwork::newNeuralNetwork(const std::string& name, + NeuralNetwork* rootNetwork) { + if (newCustomNerualNetwork) { + return newCustomNerualNetwork(name, rootNetwork); + } else { + return new NeuralNetwork(name, rootNetwork); + } } } // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index 06c679a63cc79b68b9fd27dfb64dfa9add8a1078..55ef45c5eeddc770ec3bc8fd0055d561eaf3b754 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -52,14 +51,15 @@ namespace paddle { * GPU value: NORMAL * GPU param: NORMAL */ -void parameterInitNN(int paramId, Parameter* para, +void parameterInitNN(int paramId, + Parameter* para, std::vector* sharedParams); - class NeuralNetwork : public GradientMachine { public: virtual void init( - const ModelConfig& config, ParamInitCallback callback = nullptr, + const ModelConfig& config, + ParamInitCallback callback = nullptr, const std::vector& parameterTypes = std::vector{PARAMETER_VALUE, PARAMETER_GRADIENT, @@ -76,13 +76,15 @@ public: * @param agentLayer The up-submodel's input agent layer. 
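
The `newCustomNerualNetwork` hunk above uses a classic weak-symbol hook: if no other translation unit defines the function, the weak reference is null and the default path runs. A stand-alone illustration of the pattern, with invented names, assuming GCC/Clang on an ELF platform:

```cpp
#include <cstdio>

// Weak, undefined here: its address is null unless some other object file
// linked into the program provides a definition.
extern int customHook(int x) __attribute__((weak));

int compute(int x) {
  if (customHook) return customHook(x);  // user-provided override, if any
  return x * 2;                          // built-in default behavior
}

int main() { std::printf("%d\n", compute(21)); }  // prints 42 without a hook
```

This is why `newNeuralNetwork()` can check `if (newCustomNerualNetwork)` at runtime: linking in a definition swaps the factory without recompiling the framework.
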
*/ static void connect(LayerPtr agentLayer, LayerPtr realLayer, int height = 0); - void connect(std::string agentLayerName, NeuralNetwork* srcNN, + void connect(std::string agentLayerName, + NeuralNetwork* srcNN, std::string realLayerName); virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType); + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); @@ -117,16 +119,15 @@ public: */ template void forEachLayer(T callback) { - for (auto & l : layers_) { + for (auto& l : layers_) { if (callback(l)) { break; } } } - static NeuralNetwork* newNeuralNetwork(const std::string& name = "", - NeuralNetwork* rootNetwork = nullptr); + NeuralNetwork* rootNetwork = nullptr); protected: /** @@ -139,8 +140,7 @@ protected: */ NeuralNetwork(std::string subModelName = "", NeuralNetwork* rootNetwork = nullptr) - : subModelName_(subModelName), - rootNetwork_(rootNetwork) {} + : subModelName_(subModelName), rootNetwork_(rootNetwork) {} std::string subModelName_; ModelConfig config_; diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp index 22698f586701774d884e6eeca943f6bf75fe7a96..9dbf418c31b0969eef7477a22b6f1bf63dab9b03 100644 --- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "paddle/utils/Util.h" @@ -24,14 +23,16 @@ limitations under the License. */ namespace paddle { void ParallelNeuralNetwork::init( - const ModelConfig& config, ParamInitCallback callback, - const std::vector& parameterTypes, bool useGpu) { + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { NeuralNetwork::init(config, callback, parameterTypes, useGpu); if (config.type() == "recurrent_nn") { LOG(FATAL) - << "You can not add `--parallel_nn=true` on the command line, " - << "parallel_nn training mode does not support the recurrent_nn model."; + << "You can not add `--parallel_nn=true` on the command line, " + << "parallel_nn training mode does not support the recurrent_nn model."; } useGpu_ = useGpu; @@ -54,8 +55,8 @@ void ParallelNeuralNetwork::addComputeThread(int deviceId) { } } - threads_.emplace_back(new ParallelThread(threads_.size(), deviceId, - deviceId >= 0 ? useGpu_ : false)); + threads_.emplace_back(new ParallelThread( + threads_.size(), deviceId, deviceId >= 0 ? 
useGpu_ : false)); } void ParallelNeuralNetwork::waitAllThread() { @@ -68,7 +69,8 @@ void ParallelNeuralNetwork::waitAllThread() { } } -void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId, LayerPtr layer, +void ParallelNeuralNetwork::dispatchByDeviceId(int deviceId, + LayerPtr layer, TaskType task) { for (auto& thread : threads_) { if (thread->getDeviceId() == deviceId) { diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h index 2a3db654f4e16c0ecd4be91425330208046b4a6c..71488bc3b7a52d851d0e3fb77c48f3fd36bdce83 100644 --- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h +++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "NeuralNetwork.h" @@ -35,24 +34,27 @@ enum TaskType { class ParallelNeuralNetwork : public NeuralNetwork { public: ParallelNeuralNetwork(std::string subModelName = "", - NeuralNetwork* rootNetwork = nullptr) - : NeuralNetwork(subModelName, rootNetwork) {} + NeuralNetwork *rootNetwork = nullptr) + : NeuralNetwork(subModelName, rootNetwork) {} virtual void init( - const ModelConfig &config, ParamInitCallback callback = nullptr, - const std::vector & - parameterTypes = std::vector{PARAMETER_VALUE, - PARAMETER_GRADIENT, - PARAMETER_MOMENTUM}, + const ModelConfig &config, + ParamInitCallback callback = nullptr, + const std::vector + ¶meterTypes = std::vector{PARAMETER_VALUE, + PARAMETER_GRADIENT, + PARAMETER_MOMENTUM}, bool useGpu = FLAGS_use_gpu); virtual void forward(const std::vector &inArgs, - std::vector *outArgs, PassType passType); + std::vector *outArgs, + PassType passType); virtual void backward(const UpdateCallback &callback = nullptr); void forwardBackward(const std::vector &inArgs, - std::vector *outArgs, PassType passType, + std::vector *outArgs, + PassType passType, const UpdateCallback &callback = NULL); virtual void start(const TrainerConfig &config, DataProviderPtr dataProvider); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 340cd1b9f8e927ded5d06ab0c7ab15ec75bc8469..516b61757698923eb0fde1f3b1d28074cac10044 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -53,8 +53,8 @@ typedef int (*DiyStartCalcProbCallback)(size_t nNodes, int* nodes); * path. * @NOTE: Return -INFINITY will DROP this path IMMEDIATELY!! */ -typedef real (*DiyCalcProbCallback)(int handler, size_t nNodes, int* nodes, - real curProb, bool atEos); +typedef real (*DiyCalcProbCallback)( + int handler, size_t nNodes, int* nodes, real curProb, bool atEos); /** * Finish Custom Calculation of Probability callback type. 
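
The `DiyCalcProbCallback` typedef above defines the contract for user-supplied beam scoring: return an adjusted probability, or -INFINITY to drop the path immediately. A hypothetical callback matching that signature (the early-EOS rule is invented, and `real` is typedef'd locally since Paddle's `real` is float or double depending on the build):

```cpp
#include <cmath>
#include <cstddef>

typedef float real;  // stand-in for Paddle's build-dependent `real`

// Matches: real (*)(int handler, size_t nNodes, int* nodes,
//                   real curProb, bool atEos)
real myCalcProb(int handler, std::size_t nNodes, int* nodes, real curProb,
                bool atEos) {
  (void)handler;
  (void)nodes;
  if (atEos && nNodes < 3) return -INFINITY;  // drop paths that end too early
  return curProb;  // otherwise keep the model's own score unchanged
}
```
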
@@ -190,13 +190,16 @@ public: }; void RecurrentGradientMachine::init( - const ModelConfig& config, ParamInitCallback callback, - const std::vector& parameterTypes, bool useGpu) { + const ModelConfig& config, + ParamInitCallback callback, + const std::vector& parameterTypes, + bool useGpu) { NeuralNetwork::init(config, callback, parameterTypes, useGpu); useGpu_ = useGpu; auto subModelConfig = - std::find_if(config.sub_models().begin(), config.sub_models().end(), + std::find_if(config.sub_models().begin(), + config.sub_models().end(), [this](const SubModelConfig& sub_model) { return sub_model.name() == this->subModelName_; }); @@ -224,7 +227,8 @@ void RecurrentGradientMachine::init( memoryFrameLines_[i].layerName = memoryConfig.layer_name(); memoryFrameLines_[i].linkName = memoryConfig.link_name(); auto agentConfig = - std::find_if(config.layers().begin(), config.layers().end(), + std::find_if(config.layers().begin(), + config.layers().end(), [&memoryConfig](const LayerConfig& layerConfig) { return layerConfig.name() == memoryConfig.link_name(); }); @@ -413,7 +417,8 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, // sample is one sentence if (shareInlinkInfo) { CHECK_EQ(input1.getBatchSize(), batchSize); - CHECK(std::equal(starts, starts + numSequences + 1, + CHECK(std::equal(starts, + starts + numSequences + 1, input1.sequenceStartPositions->getData(false))); } } @@ -428,7 +433,8 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, const Argument& input1 = inFrameLines_[i].inLayer->getOutput(); CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences); if (shareInlinkInfo) { - CHECK(std::equal(subStarts, subStarts + numSubSequences + 1, + CHECK(std::equal(subStarts, + subStarts + numSubSequences + 1, input1.subSequenceStartPositions->getData(false))); } } @@ -460,8 +466,10 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, // inFrameLine select rows in real layer one time for (size_t i = 0; i < inFrameLines_.size(); i++) { int curInlinkId = shareInlinkInfo ? 
0 : i; - selectRowsOneTime(inFrameLines_[i].inLayer, info_[curInlinkId].allIds, - &(inFrameLines_[i].outArg), passType); + selectRowsOneTime(inFrameLines_[i].inLayer, + info_[curInlinkId].allIds, + &(inFrameLines_[i].outArg), + passType); } } resizeOrCreateFrames(maxSequenceLength_); @@ -472,15 +480,17 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto scatterAgent = dynamic_cast(memoryFrameLine.rootAgent.get()); createMemoryFrameInfo(&memoryFrameLine, passType); - scatterAgent->setRealLayerAndOutput( - memoryFrameLine.rootLayer, memoryFrameLine.outArg, - memoryFrameLine.allIds, - /* idIndex */ 0, memoryFrameLine.allIds->getSize()); + scatterAgent->setRealLayerAndOutput(memoryFrameLine.rootLayer, + memoryFrameLine.outArg, + memoryFrameLine.allIds, + /* idIndex */ 0, + memoryFrameLine.allIds->getSize()); if (memoryFrameLine.is_sequence) { // memoryConfig is sequence int size = memoryFrameLine.sequenceStartPositions->getSize(); scatterAgent->setSequenceStartPositions( memoryFrameLine.sequenceStartPositions, - /* seqStartPosIndex */ 0, size); + /* seqStartPosIndex */ 0, + size); } } } @@ -489,7 +499,8 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto gatherAgent = dynamic_cast(outFrameLine.agentLayer.get()); CHECK_NOTNULL(gatherAgent); - gatherAgent->copyIdAndSequenceInfo(input, info_[targetInfoInlinkId_].allIds, + gatherAgent->copyIdAndSequenceInfo(input, + info_[targetInfoInlinkId_].allIds, info_[targetInfoInlinkId_].idIndex); } @@ -504,15 +515,15 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, auto scatterAgent = dynamic_cast(inFrameLine.agents[i].get()); scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer, - inFrameLine.outArg, info.allIds, - info.idIndex[i], idSize); + inFrameLine.outArg, + info.allIds, + info.idIndex[i], + idSize); if (hasSubseq) { // size: the length of subsequence - int size = - info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; - scatterAgent->setSequenceStartPositions(info.sequenceStartPositions, - info.seqStartPosIndex[i], - size); + int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i]; + scatterAgent->setSequenceStartPositions( + info.sequenceStartPositions, info.seqStartPosIndex[i], size); } } @@ -547,7 +558,7 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, if (hasSubseq) { for (auto& outFrameLine : outFrameLines_) { CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions) - << "In hierachical RNN, all out links should be from sequences."; + << "In hierachical RNN, all out links should be from sequences."; } } } @@ -573,8 +584,10 @@ void RecurrentGradientMachine::backward(const UpdateCallback& callback) { } void RecurrentGradientMachine::forwardBackward( - const std::vector& inArgs, std::vector* outArgs, - PassType passType, const UpdateCallback& callback) { + const std::vector& inArgs, + std::vector* outArgs, + PassType passType, + const UpdateCallback& callback) { LOG(FATAL) << "should not use this function"; } @@ -729,12 +742,15 @@ void RecurrentGradientMachine::createMemoryFrameInfo( // copy and check scatterId copyScattedId(allIds, &(*memoryFrameLine).allIds, input.getBatchSize()); // memoryFrameLine select rows in real layer one time - selectRowsOneTime((*memoryFrameLine).rootLayer, (*memoryFrameLine).allIds, - &(*memoryFrameLine).outArg, passType); + selectRowsOneTime((*memoryFrameLine).rootLayer, + (*memoryFrameLine).allIds, + &(*memoryFrameLine).outArg, + passType); } void RecurrentGradientMachine::copyScattedId(std::vector& srcIds, - 
IVectorPtr* dstIds, int size) { + IVectorPtr* dstIds, + int size) { int idSize = srcIds.size(); CHECK_EQ(idSize, size); IVector::resizeOrCreate(*dstIds, idSize, useGpu_); @@ -756,12 +772,12 @@ void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer, int height = realV->getHeight(); int width = realV->getWidth(); Matrix::resizeOrCreate( - arg->value, height, width, /* trans */ false, useGpu_); + arg->value, height, width, /* trans */ false, useGpu_); arg->value->zeroMem(); arg->value->selectRows(*realV, *allIds); if (passType != PASS_TEST) { - Matrix::resizeOrCreate(arg->grad, height, width, /* trans */ false, - useGpu_); + Matrix::resizeOrCreate( + arg->grad, height, width, /* trans */ false, useGpu_); arg->grad->zeroMem(); } } @@ -833,8 +849,8 @@ void RecurrentGradientMachine::generateSequence() { << "boot layer must be a sequence when is_sequence = true"; } } - NeuralNetwork::connect(memoryFrameLine.agents[0], memoryFrameLine.bootLayer, - ids.size()); + NeuralNetwork::connect( + memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size()); } // boot layer forward @@ -847,14 +863,19 @@ void RecurrentGradientMachine::generateSequence() { size_t resultNum = generator_.config.num_results_per_sample(); IVector::resizeOrCreate( generator_.outArg.ids, - generator_.config.max_num_frames() * numSequences * resultNum, false); + generator_.config.max_num_frames() * numSequences * resultNum, + false); if (resultNum > 1) { CHECK_LE(resultNum, static_cast(generator_.config.beam_size())); - Matrix::resizeOrCreate(generator_.outArg.in, /* height */ numSequences, - /* width */ resultNum, false, /* useGpu */ false); + Matrix::resizeOrCreate(generator_.outArg.in, + /* height */ numSequences, + /* width */ resultNum, + false, + /* useGpu */ false); } ICpuGpuVector::resizeOrCreate(generator_.outArg.sequenceStartPositions, - numSequences + 1, /* useGpu */ false); + numSequences + 1, + /* useGpu */ false); if (getBeamSize() > 1) { beamSearch(numSequences); } else { @@ -906,7 +927,8 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { auto scatterAgent = dynamic_cast( memoryFrameLine.scatterAgents[machineCur].get()); scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev], - scatterIds, memoryFrameLine.is_sequence); + scatterIds, + memoryFrameLine.is_sequence); scatterAgent->forward(PASS_TEST); NeuralNetwork::connect(memoryFrameLine.agents[machineCur], memoryFrameLine.scatterAgents[machineCur]); @@ -948,7 +970,8 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { starts[0] = 0; generator_.ids.clear(); for (size_t i = 0; i < batchSize; ++i) { - generator_.ids.insert(generator_.ids.end(), finalPaths[i].ids.begin(), + generator_.ids.insert(generator_.ids.end(), + finalPaths[i].ids.begin(), finalPaths[i].ids.end()); starts[i + 1] = generator_.ids.size(); batchMachineIdVec_.insert(batchMachineIdVec_.end(), @@ -999,8 +1022,11 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { if (useGpu_) { IVector::resizeOrCreate(cpuId_, ids->getSize(), false /* useGpu */); cpuId_->copyFrom(*ids); - Matrix::resizeOrCreate(cpuProb_, in->getHeight(), in->getWidth(), - false /* trans */, false /* useGpu */); + Matrix::resizeOrCreate(cpuProb_, + in->getHeight(), + in->getWidth(), + false /* trans */, + false /* useGpu */); cpuProb_->copyFrom(*in); IVector::resizeOrCreate(cpuEos_, eos->getSize(), false /* useGpu */); cpuEos_->copyFrom(*eos); @@ -1011,7 +1037,8 @@ void RecurrentGradientMachine::forwardFrame(int machineCur) { } } -void 
RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, +void RecurrentGradientMachine::singlePathExpand(Path& curPath, + size_t curPathId, std::vector& newPaths, size_t expandWidth) { int calc_id = @@ -1037,8 +1064,8 @@ void RecurrentGradientMachine::singlePathExpand(Path& curPath, size_t curPathId, if (id == -1) break; real newLogProb = generator_.config.log_prob() ? std::log(prob) : prob; - Path newPath(curPath, id, newLogProb, curPathId /*machineId*/, - k /*topIndex*/); + Path newPath( + curPath, id, newLogProb, curPathId /*machineId*/, k /*topIndex*/); if (this->beamSearchCtrlCallbacks_) { if (beamSearchCtrlCallbacks_->stopDetermineCandidates( newPath.seqId, newPath.ids, newPath.probHistory)) @@ -1104,7 +1131,8 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, } std::nth_element(newPaths.begin() + totalExpandCount, newPaths.begin() + totalExpandCount + minNewPathSize, - newPaths.end(), Path::greaterPath); + newPaths.end(), + Path::greaterPath); newPaths.resize(totalExpandCount + minNewPathSize); real minPathLogProb = @@ -1116,7 +1144,8 @@ size_t RecurrentGradientMachine::beamShrink(std::vector& newPaths, // Remove the already formed paths that are relatively short finalPaths_[seqId].erase( - std::remove_if(finalPaths_[seqId].begin(), finalPaths_[seqId].end(), + std::remove_if(finalPaths_[seqId].begin(), + finalPaths_[seqId].end(), [&](Path& p) { return p.logProb < minPathLogProb; }), finalPaths_[seqId].end()); for (auto p : finalPaths_[seqId]) { @@ -1139,7 +1168,8 @@ void RecurrentGradientMachine::fillGenOutputs() { size_t minFinalPathsSize = std::min(numResults, finalPaths_[i].size()); std::partial_sort(finalPaths_[i].begin(), finalPaths_[i].begin() + minFinalPathsSize, - finalPaths_[i].end(), Path::greaterPath); + finalPaths_[i].end(), + Path::greaterPath); finalPaths_[i].resize(minFinalPathsSize); } @@ -1154,8 +1184,8 @@ void RecurrentGradientMachine::fillGenOutputs() { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; generator_.ids.push_back(path.ids.size()); // sequence size - generator_.ids.insert(generator_.ids.end(), path.ids.begin(), - path.ids.end()); + generator_.ids.insert( + generator_.ids.end(), path.ids.begin(), path.ids.end()); generator_.ids.push_back(-1); // end of sequence probs[i * numResults + j] = path.logProb; @@ -1198,8 +1228,12 @@ void RecurrentGradientMachine::createDataOutlink( } for (size_t i = 0; i < dataArgsSize_; i++) { - dataArgs_[i].concat(dataArgsFrame_[i], machineIdVec, starts, useGpu_, - HPPL_STREAM_1, PASS_TEST); + dataArgs_[i].concat(dataArgsFrame_[i], + machineIdVec, + starts, + useGpu_, + HPPL_STREAM_1, + PASS_TEST); auto dataAgent = dynamic_cast(outFrameLines_[i + 1].agentLayer.get()); @@ -1235,7 +1269,8 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) { auto ptr = new ScopedCallbacks(beamSearchStatistics_->onEachStepStarted, - beamSearchStatistics_->onEachStepStoped, i); + beamSearchStatistics_->onEachStepStoped, + i); statisticsBlock.reset(ptr); } if (stopBeamSearch_) break; @@ -1246,7 +1281,9 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) { std::vector*> prefixes; prefixes.resize(paths.size()); std::transform( - paths.begin(), paths.end(), prefixes.begin(), + paths.begin(), + paths.end(), + prefixes.begin(), [](const Path& p) { return const_cast*>(&p.ids); }); beamSearchCtrlCallbacks_->beamSearchCandidateAdjust( prefixes, frames_[machineCur].get(), i); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h 
b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 6328213793ed6ca39214ec00124570ecb1ce273b..cb74a67e52f5f48d106b9fe93b1230a1675d3341 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -44,19 +44,22 @@ public: this->removeBeamSearchControlCallbacks(); } - virtual void init(const ModelConfig& config, ParamInitCallback callback, + virtual void init(const ModelConfig& config, + ParamInitCallback callback, const std::vector& parameterTypes, bool useGpu); virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, - std::vector* outArgs, PassType passType); + std::vector* outArgs, + PassType passType); virtual void backward(const UpdateCallback& callback = nullptr); void forwardBackward(const std::vector& inArgs, - std::vector* outArgs, PassType passType, + std::vector* outArgs, + PassType passType, const UpdateCallback& callback); virtual void resetState() {} @@ -81,8 +84,8 @@ public: * beam search, so that user can customize different operations in different * beam search iterations. */ - typedef std::function*>&, - NeuralNetwork*, const int)> + typedef std::function*>&, NeuralNetwork*, const int)> BeamSearchCandidatesAdjustCallback; /** @@ -99,8 +102,9 @@ public: * * Return true if this prefix or candidate is expected to be dropped. */ - typedef std::function&, - const std::vector&)> DropCallback; + typedef std::function&, const std::vector&)> + DropCallback; /** * @brief NormOrDropNodeCallback @@ -115,8 +119,9 @@ public: * * The fourth parameter is the probability of the whole path. */ - typedef std::function&, - std::vector&, real*)> NormOrDropNodeCallback; + typedef std::function&, std::vector&, real*)> + NormOrDropNodeCallback; /** * @brief Register beam search control callbacks. Used for prediction. @@ -346,7 +351,8 @@ protected: * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. */ - void createInFrameInfo(int inlinks_id, const Argument& input, + void createInFrameInfo(int inlinks_id, + const Argument& input, PassType passType); void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine, @@ -354,8 +360,10 @@ protected: void copyScattedId(std::vector& srcIds, IVectorPtr* dstIds, int size); - void selectRowsOneTime(LayerPtr layer, const IVectorPtr& allIds, - Argument* arg, PassType passType); + void selectRowsOneTime(LayerPtr layer, + const IVectorPtr& allIds, + Argument* arg, + PassType passType); void createSeqPos(const std::vector& sequenceStartPosition, ICpuGpuVectorPtr* sequenceStartPositions); @@ -459,7 +467,8 @@ private: * @param totalExpandCount : number of already shrinked paths in newPaths * @return size of retained paths at the end of a beam search iteration */ - size_t beamShrink(std::vector& newPaths, size_t seqId, + size_t beamShrink(std::vector& newPaths, + size_t seqId, size_t totalExpandCount); /* @@ -469,8 +478,10 @@ private: * @param curPathId : index of curPath in member newPaths * @param expandWidth : number of paths to be expanded */ - void singlePathExpand(Path& curPath, size_t curPathId, - std::vector& newPaths, size_t expandWidth); + void singlePathExpand(Path& curPath, + size_t curPathId, + std::vector& newPaths, + size_t expandWidth); /* * @brief A new beam search iteration. 
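
The shrink step documented for `beamShrink()` above boils down to keeping the top-k paths by log-probability. A self-contained sketch of that step, with a simplified `Path` standing in for the real class and the same `std::nth_element` + `greaterPath` idiom used in the .cpp hunks earlier:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

struct Path {  // simplified stand-in for RecurrentGradientMachine's Path
  float logProb;
};
static bool greaterPath(const Path& a, const Path& b) {
  return a.logProb > b.logProb;
}

// Keep only the `beamSize` most probable paths; the kept prefix is not
// fully sorted, which is all a beam search iteration needs.
void shrink(std::vector<Path>& paths, std::size_t beamSize) {
  if (paths.size() <= beamSize) return;
  std::nth_element(
      paths.begin(), paths.begin() + beamSize, paths.end(), greaterPath);
  paths.resize(beamSize);  // everything past beamSize is no more probable
}
```

`std::nth_element` is the right tool because it partitions in O(n) rather than sorting; the real `beamShrink()` additionally skips the already-shrunk prefix via `totalExpandCount`.
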
Each half-generated path in the previous diff --git a/paddle/gserver/layers/AddtoLayer.cpp b/paddle/gserver/layers/AddtoLayer.cpp index 083b1957f3a724370f1de0824a6ac79d74224a03..8a9aecfa19b815814a985183ee28344a6f4f9712 100644 --- a/paddle/gserver/layers/AddtoLayer.cpp +++ b/paddle/gserver/layers/AddtoLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "AddtoLayer.h" #include "paddle/utils/Logging.h" diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h index 0f2ca0bf19ee7dea57230042dbb13e422e8821e4..883d186f3e63f3a60789c0a4f0e05db1202f3ec8 100644 --- a/paddle/gserver/layers/AddtoLayer.h +++ b/paddle/gserver/layers/AddtoLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -21,15 +20,16 @@ limitations under the License. */ namespace paddle { -/** - * This layer just simply add all input layers together, then activate - * the sum inputs. Each input of this layer should be the same size, +/** + * This layer simply adds all input layers together, then activates + * the summed inputs. Each input of this layer should be the same size, * which is also the output size of this layer. * \f[ * y=f(\sum_{i}x_i + b) * \f] - * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is activation function. - * + * where \f$y\f$ is output, \f$x\f$ is input, \f$b\f$ is bias, and \f$f\f$ is + * the activation function. + * * The config file api is addto_layer. */ class AddtoLayer : public Layer { @@ -41,20 +41,20 @@ public: ~AddtoLayer() {} - /** - * Intialization of AddtoLayer. + /** + * Initialization of AddtoLayer. */ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - /** + /** * Forward propagation. - * @note There is no weight matrix for each input, + * @note There is no weight matrix for each input, * because it is just a simple add operation. */ void forward(PassType passType); - /** - * Backward propagation. + /** + * Backward propagation. 
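Since the doc comment above reduces the layer to y = f(sum_i x_i + b), here is that computation in isolation. A minimal sketch with plain vectors, tanh standing in for the configured activation, and illustrative names throughout (this is not the Layer API):

#include <cmath>
#include <vector>

// y = f(sum_i x_i + b): accumulate equally sized inputs onto the bias,
// then apply the activation to the sum.
std::vector<float> addtoForward(const std::vector<std::vector<float>>& inputs,
                                const std::vector<float>& bias) {
  std::vector<float> y(bias);  // start from the bias b
  for (const std::vector<float>& x : inputs)
    for (size_t j = 0; j < y.size(); ++j) y[j] += x[j];
  for (float& v : y) v = std::tanh(v);  // f = tanh in this sketch
  return y;
}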
*/ void backward(const UpdateCallback& callback = nullptr); }; diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 5e07446c71ff626684894cd99305ea8dc938d00d..eb89281cb1c75cb9b0679bd40ed4cfd4e2224188 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -44,8 +44,8 @@ void AgentLayer::forward(PassType passType) { if (realOutput.ids) { output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_); } else { - output_.subArgFrom(realOutput, /* offset */ 0, numSamples_, getSize(), - useGpu_); + output_.subArgFrom( + realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_); } } else { output_ = realOutput; @@ -64,9 +64,15 @@ void SequenceAgentLayer::forward(PassType passType) { int numRows = realOutput.sequenceStartPositions->getData(false)[numSamples_]; CHECK(!realOutput.ids) << "Not supported"; - output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_, - /* trans */ false, /* seqFlag */ true, - /* seqStart */ 0, /* seqSize */ numSamples_ + 1); + output_.subArgFrom(realOutput, + /* offset */ 0, + numRows, + getSize(), + useGpu_, + /* trans */ false, + /* seqFlag */ true, + /* seqStart */ 0, + /* seqSize */ numSamples_ + 1); } else { output_ = realOutput; } @@ -107,7 +113,8 @@ void GatherAgentLayer::forward(PassType passType) { for (size_t i = 0; i < realLayers_.size(); ++i) { const MatrixPtr& realV = realLayers_[i]->getOutputValue(); idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], - /* size */ realV->getHeight(), useGpu_); + /* size */ realV->getHeight(), + useGpu_); realV->addToRows(*outV, *idsVec_[i]); } } @@ -140,8 +147,8 @@ void ScatterAgentLayer::forward(PassType passType) { int width = this->getSize(); if (realOutArg_.value || realOutArg_.ids) { - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, - useGpu_); + output_.subArgFrom( + realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); } else { // used in generation if (realLayer_->getOutput().ids) { IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); @@ -223,8 +230,13 @@ void SequenceScatterAgentLayer::forward(PassType passType) { if (realOutArg_.value || realOutArg_.ids) { CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width, - useGpu_, /* trans */ false, /* seqFlag */ true, + output_.subArgFrom(realOutArg_, + /* offset */ idIndex_, + idSize_, + width, + useGpu_, + /* trans */ false, + /* seqFlag */ true, /* seqStart */ seqStartPosIndex_, /* seqSize */ numSequences_); } else { @@ -247,8 +259,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) { CHECK_NE(input.sequenceStartPositions.get(), output_.sequenceStartPositions.get()); - ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences + 1, false); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences + 1, false); int* outStarts = output_.sequenceStartPositions->getMutableData(false); ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false); diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index 3d7bf5583407078da4d66264e62581a59d5013ae..0186653c0f26cd2b53fc6d96d0dfad09dab6fa5b 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "Layer.h" @@ -82,7 +81,8 @@ public: bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); // call before addRealLayer - void copyIdAndSequenceInfo(const Argument& input, const IVectorPtr& allIds, + void copyIdAndSequenceInfo(const Argument& input, + const IVectorPtr& allIds, const std::vector& idIndex); // add one real layer, can call many times @@ -140,11 +140,12 @@ public: * * @param layer[input] realLayer * @param ids[input] row id in real layer - * @param copyId[input] whether to copy a cpu version of ids, - * false(default) in ScatterAgentLayer, and + * @param copyId[input] whether to copy a cpu version of ids, + * false(default) in ScatterAgentLayer, and * true in SequenceScatterAgentLayer. */ - void setRealLayer(LayerPtr layer, const std::vector& ids, + void setRealLayer(LayerPtr layer, + const std::vector& ids, bool copyId = false) { realLayer_ = layer; IVector::resizeOrCreate(ids_, ids.size(), useGpu_); @@ -161,8 +162,11 @@ public: // set real layer and output, [idIndex, idIndex + idSize) of *ids* // are selected row for realOutArg in realLayer - void setRealLayerAndOutput(LayerPtr layer, const Argument& outArg, - const IVectorPtr& ids, int idIndex, int idSize) { + void setRealLayerAndOutput(LayerPtr layer, + const Argument& outArg, + const IVectorPtr& ids, + int idIndex, + int idSize) { realLayer_ = layer; realOutArg_ = outArg; ids_ = ids; @@ -170,9 +174,9 @@ public: idSize_ = idSize; } - void setSequenceStartPositions( - const ICpuGpuVectorPtr& sequenceStartPositions, - int seqStartPosIndex, int numSequences) { + void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, + int seqStartPosIndex, + int numSequences) { realOutArg_.sequenceStartPositions = sequenceStartPositions; seqStartPosIndex_ = seqStartPosIndex; numSequences_ = numSequences; diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp index 7401cdc9a516bb1f0f68bfd27b4ad422bb7078fa..af64e15fe3ba68c62f164c45400f55fcaa937068 100644 --- a/paddle/gserver/layers/AverageLayer.cpp +++ b/paddle/gserver/layers/AverageLayer.cpp @@ -75,8 +75,8 @@ void AverageLayer::backward(const UpdateCallback& callback) { // empty sequence continue; } - dataMtx_->setData(gradientData + starts[sequenceId] * dim, sequenceLength, - dim); + dataMtx_->setData( + gradientData + starts[sequenceId] * dim, sequenceLength, dim); outMtx_->setData(gradient + sequenceId * dim); switch (mode_) { case kAverage: { diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp index 8052b35ec69c500b9005d4ffef882ceafa3bdab8..2d5bcff29fd5ad33c8eba85fc803bbf89803782e 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.cpp +++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "Layer.h" #include "BatchNormBaseLayer.h" diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h index 2302d1a8e0b17f4b67835e65a3453f8f6e20f721..d65882d39df2bb93920dad37ebc78342e31aef85 100644 --- a/paddle/gserver/layers/BatchNormBaseLayer.h +++ b/paddle/gserver/layers/BatchNormBaseLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
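The gather/scatter agents above all reduce to selecting rows of a real layer's output by an id vector. A minimal row-gather sketch over a row-major matrix stored as a flat array; the function and parameter names are illustrative, not Paddle API:

#include <cstddef>
#include <vector>

// Copy rows `ids` of a (rows x width) row-major matrix `src` into `dst`;
// this is the core operation behind forwarding only the selected rows.
void gatherRows(const std::vector<float>& src, size_t width,
                const std::vector<int>& ids, std::vector<float>* dst) {
  dst->resize(ids.size() * width);
  for (size_t i = 0; i < ids.size(); ++i)
    for (size_t j = 0; j < width; ++j)
      (*dst)[i * width + j] = src[static_cast<size_t>(ids[i]) * width + j];
}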
*/ - #pragma once #include "paddle/utils/Stat.h" @@ -21,14 +20,15 @@ limitations under the License. */ namespace paddle { /** - * @brief Batch normalization layer use to normalizes the input to across the batch. + * @brief Batch normalization layer, used to normalize the input across the + * batch. * * By default, calculating global mean and variance statistics via a running * average in the training period. Then the pre-calculated global mean and * variance are used for testing. * * Moving mean and variance are located in Parameter object when constructing - * and the calculation will change them. Now we only save global mean and + * and the calculation will change them. Now we only save global mean and * variance of one thread in first node for GPU. * But the calculation in CPU is different, because parameters are shared by * multiple threads. Here using ShareCpuMatrix with lock to calculate. We @@ -41,8 +41,7 @@ namespace paddle { class BatchNormBaseLayer : public Layer { public: - explicit BatchNormBaseLayer(const LayerConfig& config) - : Layer(config) {} + explicit BatchNormBaseLayer(const LayerConfig& config) : Layer(config) {} ~BatchNormBaseLayer() {} @@ -55,8 +54,8 @@ public: virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - /** - * @brief Calculate feature map size. Some input uses frameHeight and + /** + * @brief Calculate feature map size. Some input uses frameHeight and * frameWidth to store feature size */ void calFeatureMapSize(); diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp index b2921e6d40d3d5d3777fbb26fa9314aaa73f82da..e431c033117c5d405324e7440b84d0e79018b52a 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.cpp +++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #ifndef PADDLE_ONLY_CPU #include "hl_batch_transpose.h" #endif @@ -44,8 +43,8 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { tmpMat_->square(); savedInvVar_->zeroMem(); savedInvVar_->accumulateColSum(*tmpMat_); - savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] - savedInvVar_->addSquare(*savedMean_, -1.0); // E[x^2] - E^2[x] + savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] + savedInvVar_->addSquare(*savedMean_, -1.0); // E[x^2] - E^2[x] // Variance may be small negative value // because of the subtraction operation. 
@@ -104,17 +103,23 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { #ifdef PADDLE_ONLY_CPU LOG(FATAL) << "paddle is compiled only for cpu"; #else - batchTranspose(in->getData(), out->getData(), imgPixels_, - channels_, batchSize); + batchTranspose( + in->getData(), out->getData(), imgPixels_, channels_, batchSize); #endif } else { for (size_t i = 0; i < batchSize; i++) { const MatrixPtr inTmp = - Matrix::create(in->getData() + i * imgPixels_ * channels_, channels_, - imgPixels_, false, useGpu_); + Matrix::create(in->getData() + i * imgPixels_ * channels_, + channels_, + imgPixels_, + false, + useGpu_); MatrixPtr outTmp = Matrix::create(out->getData() + i * imgPixels_ * channels_, - imgPixels_, channels_, false, useGpu_); + imgPixels_, + channels_, + false, + useGpu_); inTmp->transpose(outTmp, false); } } @@ -135,23 +140,27 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) { #ifdef PADDLE_ONLY_CPU LOG(FATAL) << "paddle is compiled only for cpu"; #else - batchTranspose(in->getData(), out->getData(), channels_, - imgPixels_, batchSize); + batchTranspose( + in->getData(), out->getData(), channels_, imgPixels_, batchSize); #endif } else { for (size_t i = 0; i < batchSize; i++) { const MatrixPtr inTmp = - Matrix::create(in->getData() + i * channels_ * imgPixels_, imgPixels_, - channels_, false, useGpu_); + Matrix::create(in->getData() + i * channels_ * imgPixels_, + imgPixels_, + channels_, + false, + useGpu_); MatrixPtr outTmp = - Matrix::create(out->getData() + i * imgPixels_ * channels_, channels_, - imgPixels_, useGpu_); + Matrix::create(out->getData() + i * imgPixels_ * channels_, + channels_, + imgPixels_, + useGpu_); inTmp->transpose(outTmp, false); } } } - void BatchNormalizationLayer::forward(PassType passType) { Layer::forward(passType); @@ -165,12 +174,12 @@ void BatchNormalizationLayer::forward(PassType passType) { useGlobalStats_ = config_.use_global_stats(); } - Matrix::resizeOrCreate(expandedIn_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(normIn_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(expandedOut_, batchSize * imgPixels_, channels_, false, - useGpu_); + Matrix::resizeOrCreate( + expandedIn_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + normIn_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + expandedOut_, batchSize * imgPixels_, channels_, false, useGpu_); expandMat(getInputValue(0), expandedIn_); if (useGlobalStats_) { @@ -184,7 +193,7 @@ void BatchNormalizationLayer::forward(PassType passType) { } normIn_->assign(*expandedIn_); - normIn_->addBias(*savedMean_, -1); // subtract mean. + normIn_->addBias(*savedMean_, -1); // subtract mean. normIn_->divRowVector(*savedInvVar_); // divide std. 
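The statistics pipeline in calMeanAndStd and forward above (column mean, variance as E[x^2] - E^2[x], then subtract the mean and divide by the standard deviation) in a minimal per-column form; the eps guard and the container layout are assumptions of this sketch, not quotes of the layer:

#include <cmath>
#include <vector>

// Normalize n samples of one feature in place. Variance is computed as
// E[x^2] - E[x]^2 and clamped at zero, since the subtraction can go
// slightly negative, the same caveat the comment above notes.
void normalizeColumn(std::vector<float>& x, float eps = 1e-5f) {
  if (x.empty()) return;
  const float n = static_cast<float>(x.size());
  float sum = 0.f, sumSq = 0.f;
  for (float v : x) { sum += v; sumSq += v * v; }
  const float mean = sum / n;
  float var = sumSq / n - mean * mean;  // E[x^2] - E^2[x]
  if (var < 0.f) var = 0.f;
  const float invStd = 1.f / std::sqrt(var + eps);
  for (float& v : x) v = (v - mean) * invStd;  // subtract mean, divide std
}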
expandedOut_->assign(*normIn_); @@ -211,18 +220,18 @@ void BatchNormalizationLayer::backward(const UpdateCallback& callback) { Matrix::resizeOrCreate(meanGrad_, 1, channels_, false, useGpu_); Matrix::resizeOrCreate(stdGrad_, 1, channels_, false, useGpu_); - Matrix::resizeOrCreate(expandedInGrad_, batchSize * imgPixels_, channels_, - false, useGpu_); - Matrix::resizeOrCreate(inGrad_, batchSize, imgPixels_ * channels_, false, - useGpu_); - Matrix::resizeOrCreate(normInGrad_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(expandedOutGrad_, batchSize * imgPixels_, channels_, - false, useGpu_); - Matrix::resizeOrCreate(tmpMat_, batchSize * imgPixels_, channels_, false, - useGpu_); - Matrix::resizeOrCreate(tmpGrad_, batchSize * imgPixels_, channels_, false, - useGpu_); + Matrix::resizeOrCreate( + expandedInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + inGrad_, batchSize, imgPixels_ * channels_, false, useGpu_); + Matrix::resizeOrCreate( + normInGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + expandedOutGrad_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + tmpMat_, batchSize * imgPixels_, channels_, false, useGpu_); + Matrix::resizeOrCreate( + tmpGrad_, batchSize * imgPixels_, channels_, false, useGpu_); expandMat(getOutputGrad(), expandedOutGrad_); diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h index 175b9a80e63f796d272af3940705def7b9857df7..36925a5ed2d56e4a5c58525cc238164f72bef40c 100644 --- a/paddle/gserver/layers/BatchNormalizationLayer.h +++ b/paddle/gserver/layers/BatchNormalizationLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/BilinearInterpLayer.cpp b/paddle/gserver/layers/BilinearInterpLayer.cpp index ac5f87be7af070a1146f79b633c777e77633b80b..c30e26dc031378ce792534c5eec6c24fc0d20ef9 100644 --- a/paddle/gserver/layers/BilinearInterpLayer.cpp +++ b/paddle/gserver/layers/BilinearInterpLayer.cpp @@ -40,10 +40,10 @@ size_t BilinearInterpLayer::getSize() { CHECK(inImgH_ > 0 && inImgW_ > 0); CHECK(numChannels_); - ratioH_ = (outImgH_ > 1) ? - static_cast(inImgH_ - 1) / (outImgH_ - 1) : 0.f; - ratioW_ = (outImgW_ > 1) ? - static_cast(inImgW_ - 1) / (outImgW_ - 1) : 0.f; + ratioH_ = + (outImgH_ > 1) ? static_cast(inImgH_ - 1) / (outImgH_ - 1) : 0.f; + ratioW_ = + (outImgW_ > 1) ? 
static_cast(inImgW_ - 1) / (outImgW_ - 1) : 0.f; getOutput().setFrameHeight(outImgH_); getOutput().setFrameWidth(outImgW_); @@ -74,21 +74,33 @@ void BilinearInterpLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); { REGISTER_TIMER_INFO("FwBilinearInterpTimer", getName().c_str()); - outV->bilinearForward(*inV, inImgH_, inImgW_, outImgH_, outImgW_, - numChannels_, ratioH_, ratioW_); + outV->bilinearForward(*inV, + inImgH_, + inImgW_, + outImgH_, + outImgW_, + numChannels_, + ratioH_, + ratioW_); } } void BilinearInterpLayer::backward(const UpdateCallback& callback) { - (void) callback; + (void)callback; MatrixPtr inputG = getInputGrad(0); MatrixPtr outG = getOutputGrad(); { REGISTER_TIMER_INFO("BwBilinearInterpTimer", getName().c_str()); if (inputG) { - inputG->bilinearBackward(*outG, outImgH_, outImgW_, inImgH_, inImgW_, - numChannels_, ratioH_, ratioW_); + inputG->bilinearBackward(*outG, + outImgH_, + outImgW_, + inImgH_, + inImgW_, + numChannels_, + ratioH_, + ratioW_); } } } diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp index 8da159def82b0cb91bc8ffbd8f29891319fa6f35..17d77879b27be332a49eae4e476b776ec2f5c8e2 100644 --- a/paddle/gserver/layers/BlockExpandLayer.cpp +++ b/paddle/gserver/layers/BlockExpandLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "BlockExpandLayer.h" #include "paddle/utils/Logging.h" @@ -52,7 +51,7 @@ size_t BlockExpandLayer::getBlockNum() { if (imgSizeW_ == 0) { imgSizeW_ = blockConf.img_size_x(); } - size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_; + size_t tmpH = 2 * paddingH_ + imgSizeH_ - blockH_; outputH_ = (int)tmpH < 0 ? 1 : 1 + (tmpH + strideH_ - 1) / strideH_; size_t tmpW = 2 * paddingW_ + imgSizeW_ - blockW_; outputW_ = (int)tmpW < 0 ? 
1 : 1 + (tmpW + strideW_ - 1) / strideW_; @@ -73,8 +72,8 @@ void BlockExpandLayer::forward(PassType passType) { MatrixPtr input = getPrev(0)->getOutputValue(); Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_); - ICpuGpuVector::resizeOrCreate(out.sequenceStartPositions, - batchSize + 1, false); + ICpuGpuVector::resizeOrCreate( + out.sequenceStartPositions, batchSize + 1, false); IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false); int* start = out.sequenceStartPositions->getMutableData(false); int* dims = out.cpuSequenceDims->getData(); @@ -82,14 +81,29 @@ void BlockExpandLayer::forward(PassType passType) { outVTrans_->zeroMem(); /* expand each block as one row */ MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), 1, - input->getWidth(), false, useGpu_); - outVTrans_->convExpand(*inputTmp, imgSizeH_, imgSizeW_, channels_, blockH_, - blockW_, strideH_, strideW_, paddingH_, paddingW_, - outputH_, outputW_); + Matrix::create(input->getData() + i * input->getWidth(), + 1, + input->getWidth(), + false, + useGpu_); + outVTrans_->convExpand(*inputTmp, + imgSizeH_, + imgSizeW_, + channels_, + blockH_, + blockW_, + strideH_, + strideW_, + paddingH_, + paddingW_, + outputH_, + outputW_); MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, blockNum, - blockSize, false, useGpu_); + Matrix::create(outV->getData() + i * blockNum * blockSize, + blockNum, + blockSize, + false, + useGpu_); outVTrans_->transpose(outVTmp, false); start[i] = i * blockNum; dims[2 * i] = outputH_; @@ -115,15 +129,32 @@ void BlockExpandLayer::backward(const UpdateCallback& callback) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, blockNum, - blockSize, false, useGpu_); + Matrix::create(grad->getData() + i * blockNum * blockSize, + blockNum, + blockSize, + false, + useGpu_); gradTmp->transpose(gradTrans, false); MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), 1, - preGrad->getWidth(), false, useGpu_); - preGradTmp->convShrink(*gradTrans, imgSizeH_, imgSizeW_, channels_, blockH_, - blockW_, strideH_, strideW_, paddingH_, paddingW_, - outputH_, outputW_, 1.0, 1.0); + Matrix::create(preGrad->getData() + i * preGrad->getWidth(), + 1, + preGrad->getWidth(), + false, + useGpu_); + preGradTmp->convShrink(*gradTrans, + imgSizeH_, + imgSizeW_, + channels_, + blockH_, + blockW_, + strideH_, + strideW_, + paddingH_, + paddingW_, + outputH_, + outputW_, + 1.0, + 1.0); } } diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index f8f81721278c6c70a2bbea5f10ab9a1b9e501b35..1496fb681acd7ca7190e43cce38c7eb347932d29 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp index d3dfbb7c80f68b8134edc15625abf58504f27017..8986741dc307ba765707d6e5817a2e376b27828e 100644 --- a/paddle/gserver/layers/CRFDecodingLayer.cpp +++ b/paddle/gserver/layers/CRFDecodingLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "CRFDecodingLayer.h" namespace paddle { @@ -46,7 +45,8 @@ void CRFDecodingLayer::forward(PassType passType) { for (size_t i = 0; i < numSequences; ++i) { crf_->decode(output.value->getData() + numClasses_ * starts[i], - output_.ids->getData() + starts[i], starts[i + 1] - starts[i]); + output_.ids->getData() + starts[i], + starts[i + 1] - starts[i]); } if (inputLayers_.size() == 2) { diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h index 005bffff6b6b803dba4c72fcbdd61cf09838f014..1914062011d3bceba2f8765fb3cfd2d29ca6d6e9 100644 --- a/paddle/gserver/layers/CRFDecodingLayer.h +++ b/paddle/gserver/layers/CRFDecodingLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index c1dcad2b5f2a840ba06e8ef9833eee7a6e5e20cb..ed4f864ba9167129db1a3f56403940d9d7807a15 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CRFLayer.h" namespace paddle { @@ -73,12 +72,13 @@ void CRFLayer::forward(PassType passType) { crfs_.emplace_back(numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), parameter_->getBuf(PARAMETER_GRADIENT) - ? parameter_->getBuf(PARAMETER_GRADIENT)->getData() - : nullptr); + ? parameter_->getBuf(PARAMETER_GRADIENT)->getData() + : nullptr); } - output_.value->getData()[i] = crfs_[i].forward( - output.value->getData() + numClasses_ * starts[i], - label.ids->getData() + starts[i], starts[i + 1] - starts[i]); + output_.value->getData()[i] = + crfs_[i].forward(output.value->getData() + numClasses_ * starts[i], + label.ids->getData() + starts[i], + starts[i + 1] - starts[i]); } if (weightLayer_) { @@ -87,7 +87,7 @@ void CRFLayer::forward(PassType passType) { } } -void CRFLayer::backward(const UpdateCallback &callback) { +void CRFLayer::backward(const UpdateCallback& callback) { const Argument& output = getInput(0); const Argument& label = getInput(1); const int* starts = label.sequenceStartPositions->getData(false); @@ -100,7 +100,7 @@ void CRFLayer::backward(const UpdateCallback &callback) { starts[i + 1] - starts[i]); if (weightLayer_) { real weight = getInputValue(*weightLayer_)->getElement(i, 0); - MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i+1]); + MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]); grad->mulScalar(weight); } } diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h index 58902a0d3b7e4cad67dac94be10c35ebbf83b001..21c7fc61e168cea438339db4e7abce59082fc58d 100644 --- a/paddle/gserver/layers/CRFLayer.h +++ b/paddle/gserver/layers/CRFLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -39,7 +38,7 @@ protected: ParameterPtr parameter_; std::vector crfs_; LayerPtr weightLayer_; // weight for each sequence - real coeff_; // weight for the layer + real coeff_; // weight for the layer }; } // namespace paddle diff --git a/paddle/gserver/layers/CTCLayer.cpp b/paddle/gserver/layers/CTCLayer.cpp index 6b9ffc5c749fb45be567881b8e625b48e28f69b4..be5d2c8c75d6eb2381a2c1758088de0eff462200 100644 --- a/paddle/gserver/layers/CTCLayer.cpp +++ b/paddle/gserver/layers/CTCLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CTCLayer.h" /* Please reference the Chapter7 in @@ -71,8 +70,7 @@ void CTCLayer::forwardImp(const Argument& softmaxSeqs, resizeOutput(numSequences, 1); std::vector out(numSequences); - const int* labelSeqsStarts = - labelSeqs.sequenceStartPositions->getData(false); + const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false); const int* softmaxSeqsStarts = softmaxSeqs.sequenceStartPositions->getData(false); @@ -81,22 +79,22 @@ void CTCLayer::forwardImp(const Argument& softmaxSeqs, ctcs_.emplace_back(numClasses_, normByTimes_); } out[i] = ctcs_[i].forward( - softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i], - softmaxSeqsStarts[i + 1] - softmaxSeqsStarts[i], - labelSeqs.ids->getData() + labelSeqsStarts[i], - labelSeqsStarts[i + 1] - labelSeqsStarts[i]); + softmaxSeqs.value->getData() + numClasses_ * softmaxSeqsStarts[i], + softmaxSeqsStarts[i + 1] - softmaxSeqsStarts[i], + labelSeqs.ids->getData() + labelSeqsStarts[i], + labelSeqsStarts[i + 1] - labelSeqsStarts[i]); } output_.value->copyFrom(out.data(), numSequences); } -void CTCLayer::backward(const UpdateCallback &callback) { +void CTCLayer::backward(const UpdateCallback& callback) { (void)callback; if (useGpu_) { backwardImp(callback, tmpCpuInput_[0], tmpCpuInput_[1]); - const_cast(getInput(0)). - resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); - const_cast(getInput(1)). - resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); + const_cast(getInput(0)) + .resizeAndCopyFrom(tmpCpuInput_[0], true, HPPL_STREAM_DEFAULT); + const_cast(getInput(1)) + .resizeAndCopyFrom(tmpCpuInput_[1], true, HPPL_STREAM_DEFAULT); } else { backwardImp(callback, getInput(0), getInput(1)); } @@ -107,8 +105,7 @@ void CTCLayer::backwardImp(const UpdateCallback& callback, const Argument& labelSeqs) { size_t numSequences = labelSeqs.sequenceStartPositions->getSize() - 1; - const int* labelSeqsStarts = - labelSeqs.sequenceStartPositions->getData(false); + const int* labelSeqsStarts = labelSeqs.sequenceStartPositions->getData(false); const int* softmaxSeqsStarts = softmaxSeqs.sequenceStartPositions->getData(false); diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h index 49a059e43e6af4194bf50fbab14f545b81f65795..18ba12583b5a22849f1ee849a3cce7249730fdaf 100644 --- a/paddle/gserver/layers/CTCLayer.h +++ b/paddle/gserver/layers/CTCLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "Layer.h" @@ -28,7 +27,8 @@ public: void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs); virtual void backward(const UpdateCallback& callback); void backwardImp(const UpdateCallback& callback, - const Argument& softmaxSeqs, const Argument& labelSeqs); + const Argument& softmaxSeqs, + const Argument& labelSeqs); protected: size_t numClasses_; diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp index a986ec10b4a01c8cc87b067d13e76b8c456bda34..910eec8bbc10ef10f5dd4e4688eef5e87c21f506 100644 --- a/paddle/gserver/layers/ConcatenateLayer.cpp +++ b/paddle/gserver/layers/ConcatenateLayer.cpp @@ -97,8 +97,7 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) { */ class ConcatenateLayer2 : public Layer { public: - explicit ConcatenateLayer2(const LayerConfig& config) : - Layer(config) {} + explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {} ~ConcatenateLayer2() {} @@ -130,8 +129,8 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, size_t startCol = 0; size_t endCol = 0; for (size_t i = 0; i < inputLayers_.size(); i++) { - projections_.emplace_back(Projection::create(config_.inputs(i).proj_conf(), - parameters_[i], useGpu_)); + projections_.emplace_back(Projection::create( + config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); endCol += projections_[i]->getOutputSize(); projCol_.push_back(std::make_pair(startCol, endCol)); diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp index 3b1498f7e986737e01115c44f964b4a7ee924095..30dbf168fb6e439048e0168af572d1f20a303e79 100644 --- a/paddle/gserver/layers/ContextProjection.cpp +++ b/paddle/gserver/layers/ContextProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "ContextProjection.h" @@ -21,7 +20,8 @@ namespace paddle { REGISTER_PROJECTION(context, ContextProjection); ContextProjection::ContextProjection(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) + ParameterPtr parameter, + bool useGpu) : Projection(config, parameter, useGpu) { CHECK(config.has_context_start()); CHECK(config.has_context_length()); @@ -44,10 +44,13 @@ void ContextProjection::resetState() { CHECK_LE(config_.context_start() + config_.context_length(), 1) << "state is not allowed for future context"; if (config_.context_start() >= 0) return; - Matrix::resizeOrCreate(state_, -config_.context_start(), config_.input_size(), + Matrix::resizeOrCreate(state_, + -config_.context_start(), + config_.input_size(), false, // trans useGpu_); - Matrix::resizeOrCreate(state2_, -config_.context_start(), + Matrix::resizeOrCreate(state2_, + -config_.context_start(), config_.input_size(), false, // trans useGpu_); @@ -78,8 +81,7 @@ void ContextProjection::forward() { CHECK(in_->value); CHECK(in_->sequenceStartPositions); - auto startPositions = - in_->sequenceStartPositions->getVector(useGpu_); + auto startPositions = in_->sequenceStartPositions->getVector(useGpu_); int64_t inputDim = in_->value->getWidth(); int64_t dim = out_->value->getWidth(); @@ -88,9 +90,13 @@ void ContextProjection::forward() { REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str()); bool isPadding = config_.trainable_padding(); out_->value->contextProjectionForward( - in_->value, state_ ? state_ : isPadding ? 
weight_->getW() : nullptr, - *startPositions, config_.context_length(), config_.context_start(), - beginPad_, state_ ? true : isPadding); + in_->value, + state_ ? state_ : isPadding ? weight_->getW() : nullptr, + *startPositions, + config_.context_length(), + config_.context_start(), + beginPad_, + state_ ? true : isPadding); if (state_ && config_.context_start() < 0) { CHECK_EQ(1, in_->getNumSequences()); @@ -116,27 +122,35 @@ void ContextProjection::backward(const UpdateCallback& callback) { int64_t inputDim = in_->value->getWidth(); int64_t dim = out_->value->getWidth(); CHECK_EQ(dim, inputDim * config_.context_length()); - auto startPositions = - in_->sequenceStartPositions->getVector(useGpu_); + auto startPositions = in_->sequenceStartPositions->getVector(useGpu_); REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str()); bool isPadding = config_.trainable_padding(); if (!out_->grad->useGpu()) { out_->grad->contextProjectionBackward( - in_->grad, isPadding ? weight_->getWGrad() : nullptr, *startPositions, - config_.context_length(), config_.context_start(), beginPad_, + in_->grad, + isPadding ? weight_->getWGrad() : nullptr, + *startPositions, + config_.context_length(), + config_.context_start(), + beginPad_, isPadding); } else { if (in_->grad) { - out_->grad->contextProjectionBackwardData(in_->grad, *startPositions, + out_->grad->contextProjectionBackwardData(in_->grad, + *startPositions, config_.context_length(), config_.context_start()); } if (isPadding && weight_->getWGrad()) { out_->grad->contextProjectionBackwardWeight( - weight_->getWGrad(), *startPositions, config_.context_length(), - config_.context_start(), weight_->getWGrad()->getHeight(), beginPad_); + weight_->getWGrad(), + *startPositions, + config_.context_length(), + config_.context_start(), + weight_->getWGrad()->getHeight(), + beginPad_); } } diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h index 0786ee28f2eed9c73659eb2ca0d691da8d1e3e29..188dec0fb31bf468c76b9b922e0972c86e819a2d 100644 --- a/paddle/gserver/layers/ContextProjection.h +++ b/paddle/gserver/layers/ContextProjection.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Projection.h" @@ -50,7 +49,8 @@ public: * and if it is set, constructor will set learned weight, which is used to * pad output. */ - ContextProjection(const ProjectionConfig& config, ParameterPtr parameter, + ContextProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 6bc3b3b801796a227a7b767c8da048a3ccf88827..7637e245a38959220f0d1d52e1f705d86a7c7303 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -22,7 +22,8 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, /* Initialize the basic parent class */ Layer::init(layerMap, parameterMap); isDeconv_ = (config_.type() == "exconv" || config_.type() == "cudnn_conv") - ? false : true; + ? 
false + : true; /* Initialize the convolutional layer parameter */ numFilters_ = config_.num_filters(); @@ -88,33 +89,25 @@ size_t ConvBaseLayer::calOutputSize() { auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) { for (size_t i = 0; i < inputLayers_.size(); i++) { - inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); - inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); - if (isDeconv_) { - if (inH[i] == 0) - inH[i] = config_.inputs(i).conv_conf().output_x(); - if (inW[i] == 0) - inW[i] = config_.inputs(i).conv_conf().output_x(); - outH.push_back( - imageSize(inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], - caffeMode_)); - outW.push_back( - imageSize(inW[i], filterSize_[i], padding_[i], stride_[i], - caffeMode_)); - } else { - if (inH[i] == 0) - inH[i] = config_.inputs(i).conv_conf().img_size(); - if (inW[i] == 0) - inW[i] = config_.inputs(i).conv_conf().img_size(); - outH.push_back( - outputSize(inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], - caffeMode_)); - outW.push_back( - outputSize(inW[i], filterSize_[i], padding_[i], stride_[i], - caffeMode_)); - } - CHECK_EQ(outH[i], outH[0]); - CHECK_EQ(outW[i], outW[0]); + inH.push_back(inputLayers_[i]->getOutput().getFrameHeight()); + inW.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + if (isDeconv_) { + if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().output_x(); + if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().output_x(); + outH.push_back(imageSize( + inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back(imageSize( + inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); + } else { + if (inH[i] == 0) inH[i] = config_.inputs(i).conv_conf().img_size(); + if (inW[i] == 0) inW[i] = config_.inputs(i).conv_conf().img_size(); + outH.push_back(outputSize( + inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_)); + outW.push_back(outputSize( + inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_)); + } + CHECK_EQ(outH[i], outH[0]); + CHECK_EQ(outW[i], outW[0]); } getOutput().setFrameHeight(outH[0]); getOutput().setFrameWidth(outW[0]); diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h index b80cab899585e7bd93bfc86d8afa116d343d36d7..85f57dbe0b7c9683ba0941ea0edc611f683cf1b4 100644 --- a/paddle/gserver/layers/ConvBaseLayer.h +++ b/paddle/gserver/layers/ConvBaseLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
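The outputSize and imageSize helpers that calOutputSize leans on above implement standard convolution arithmetic. A sketch of the caffeMode (floor-division) convention and its deconvolution inverse; the exact rounding is an assumption of this note, not a quote of the helpers:

// Spatial output size of a convolution under the caffe convention, and the
// inverse mapping used by the deconvolution branch above.
int convOutputSize(int in, int filter, int padding, int stride) {
  return (in + 2 * padding - filter) / stride + 1;
}
int deconvImageSize(int out, int filter, int padding, int stride) {
  return (out - 1) * stride + filter - 2 * padding;
}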
*/ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index 2d9c892fe595f2f4dcdc9dcc3cd392a6c29fac01..9b8e18b1ba2a4502bcdcecade94ec3e29730595c 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -155,9 +155,15 @@ void ConvOperator::reshape(int batchSize) { reshapeImageDescriptors(); if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_, convDesc_, - &fwdAlgo_, &fwdLimitBytes_, &bwdDataAlgo_, - &bwdDataLimitBytes_, &bwdFilterAlgo_, + hl_conv_workspace(inputDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, &bwdFilterLimitBytes_); size_t maxWorkSpace = 0; @@ -171,26 +177,48 @@ void ConvOperator::reshape(int batchSize) { } void ConvOperator::computeConvSizes() { - hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_, - filterSizeY_, filterSize_); + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); hl_create_tensor_descriptor(&inputDesc_); int outputX = outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); CHECK_EQ(outputX, outputX_); hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_, - paddingY_, padding_, strideY_, stride_); + hl_create_convolution_descriptor(&convDesc_, + inputDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); } void ConvOperator::reshapeImageDescriptors() { - hl_tensor_reshape(inputDesc_, 1, channels_, imageH_, imageW_, - channels_ * imageH_ * imageW_, imageH_ * imageW_, imageW_, + hl_tensor_reshape(inputDesc_, + 1, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, 1); - hl_tensor_reshape(outputDesc_, 1, numFilters_, outputH_, outputW_, - numFilters_ * outputH_ * outputW_, outputH_ * outputW_, - outputW_, 1); - hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_, paddingY_, - padding_, strideY_, stride_); + hl_tensor_reshape(outputDesc_, + 1, + numFilters_, + outputH_, + outputW_, + numFilters_ * outputH_ * outputW_, + outputH_ * outputW_, + outputW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + inputDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); inputOffset_ = channels_ * imageH_ * imageW_; outputOffset_ = numFilters_ * outputH_ * outputW_; weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSize_; @@ -220,17 +248,27 @@ void ConvOperator::forward() { reshape(batchSize); CHECK_EQ(ins_[1]->value->getHeight(), batchSize); checkFilterSize(ins_[1]->value); - Matrix::resizeOrCreate(out_->value, batchSize, - outputH_ * outputW_ * numFilters_, false, useGpu_); + Matrix::resizeOrCreate(out_->value, + batchSize, + outputH_ * outputW_ * numFilters_, + false, + useGpu_); { AsyncGpuBlock block; for (size_t batchId = 0; batchId < batchSize; ++batchId) { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_forward(inputDesc_, inputData, outputDesc_, outData, - filterDesc_, wgtData, convDesc_, workSpace_, - workSpaceInBytes_, fwdAlgo_); + hl_convolution_forward(inputDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); } } } @@ -244,9 +282,15 @@ void 
ConvOperator::backward() { if (ins_[1]->grad) { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(inputDesc_, inputData, outputDesc_, - outGrad, filterDesc_, weightGrad, - convDesc_, workSpace_, workSpaceInBytes_, + hl_convolution_backward_filter(inputDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, bwdFilterAlgo_); } @@ -254,9 +298,16 @@ void ConvOperator::backward() { if (NULL != preGrad) { real *inputGrad = preGrad->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_backward_data( - inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_, wgtData, - convDesc_, workSpace_, workSpaceInBytes_, bwdDataAlgo_); + hl_convolution_backward_data(inputDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); } } } diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp index d1ce53fe26351926196a04418900a1555e0282c2..4ab0a1dc84164114df080bc1ae06905b15a3ff86 100644 --- a/paddle/gserver/layers/ConvProjection.cpp +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "ConvProjection.h" @@ -20,12 +19,12 @@ namespace paddle { REGISTER_PROJECTION(conv, ConvProjection); -ThreadLocalD> ConvProjection::convMem_; +ThreadLocalD> ConvProjection::convMem_; -ConvProjection::ConvProjection(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) +ConvProjection::ConvProjection(const ProjectionConfig &config, + ParameterPtr parameter, + bool useGpu) : Projection(config, parameter, useGpu) { - CHECK(useGpu); // only support GPU getConvParams(); initCudnn(); @@ -59,12 +58,17 @@ void ConvProjection::getConvParams() { } void ConvProjection::initCudnn() { - hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_, - filterH_, filterW_); + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterH_, filterW_); hl_create_tensor_descriptor(&inputDesc_); hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_, - paddingH_, paddingW_, strideH_, strideW_); + hl_create_convolution_descriptor(&convDesc_, + inputDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); // initialize all to default algorithms fwdAlgo_ = 0; @@ -80,11 +84,22 @@ void ConvProjection::initCudnn() { } void ConvProjection::reshapeTensorDesc(int batchSize) { - hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_, - channels_ * imageH_ * imageW_, imageH_ * imageW_, - imageW_, 1); - hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_, - paddingH_, paddingW_, strideH_, strideW_); + hl_tensor_reshape(inputDesc_, + batchSize, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + inputDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); // The stride between two consecutive images in ConvProjection may not be 1, // for example, in the case of layer ConcatenateLayer2 with two @@ -98,8 +113,15 @@ 
void ConvProjection::reshapeTensorDesc(int batchSize) { nStride = out_->value->getStride(); } - hl_tensor_reshape(outputDesc_, batchSize, numFilters_, outputH_, outputW_, - nStride, outputH_ * outputW_, outputW_, 1); + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_, + outputH_, + outputW_, + nStride, + outputH_ * outputW_, + outputW_, + 1); } void ConvProjection::reshape(int batchSize) { @@ -111,20 +133,24 @@ void ConvProjection::reshape(int batchSize) { if (!isSelectAlgo_) { reshapeTensorDesc(batchSize); - hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_, - convDesc_, &fwdAlgo_, &fwdLimitBytes_, - &bwdDataAlgo_, &bwdDataLimitBytes_, - &bwdFilterAlgo_, &bwdFilterLimitBytes_); + hl_conv_workspace(inputDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); size_t maxWorkSpace = 0; maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); workSpaceInBytes_ = maxWorkSpace; - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ - << " / " << bwdFilterAlgo_; + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; } isSelectAlgo_ = true; @@ -134,7 +160,7 @@ void ConvProjection::forward() { int batchSize = in_->value->getHeight(); reshape(batchSize); - void* workSpace = NULL; + void *workSpace = NULL; if (workSpaceInBytes_ > 0) { workSpace = getSpaceBytes(workSpaceInBytes_); } @@ -145,17 +171,23 @@ void ConvProjection::forward() { real *inputData = in_->value->getData() + g * inputOffset_; real *wgtData = weight_->getW()->getData() + g * weightOffset_; real *outData = out_->value->getData() + g * outputOffset_; - hl_convolution_forward(inputDesc_, inputData, outputDesc_, - outData, filterDesc_, wgtData, - convDesc_, workSpace, - fwdLimitBytes_, fwdAlgo_); + hl_convolution_forward(inputDesc_, + inputData, + outputDesc_, + outData, + filterDesc_, + wgtData, + convDesc_, + workSpace, + fwdLimitBytes_, + fwdAlgo_); } } -void ConvProjection::backward(const UpdateCallback& callback) { +void ConvProjection::backward(const UpdateCallback &callback) { REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); - void* workSpace = NULL; + void *workSpace = NULL; if (workSpaceInBytes_ > 0) { workSpace = getSpaceBytes(workSpaceInBytes_); } @@ -165,35 +197,47 @@ void ConvProjection::backward(const UpdateCallback& callback) { if (weight_->getWGrad()) { real *inputData = in_->value->getData() + g * inputOffset_; real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; - hl_convolution_backward_filter( - inputDesc_, inputData, outputDesc_, outGrad, filterDesc_, - weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_, - bwdFilterAlgo_); + hl_convolution_backward_filter(inputDesc_, + inputData, + outputDesc_, + outGrad, + filterDesc_, + weightGrad, + convDesc_, + workSpace, + bwdFilterLimitBytes_, + bwdFilterAlgo_); } MatrixPtr preGrad = in_->grad; if (NULL != preGrad) { real *inputGrad = preGrad->getData() + g * inputOffset_; - real *wgtData = weight_->getW()->getData() + g* weightOffset_; - hl_convolution_backward_data( - inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_, - wgtData, convDesc_, workSpace, bwdDataLimitBytes_, - bwdDataAlgo_); + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + hl_convolution_backward_data(inputDesc_, + inputGrad, + outputDesc_, + outGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace, + bwdDataLimitBytes_, 
+ bwdDataAlgo_); } } weight_->getParameterPtr()->incUpdate(callback); } -void* ConvProjection::getSpaceBytes(size_t size) { - std::vector& convMem = *convMem_; +void *ConvProjection::getSpaceBytes(size_t size) { + std::vector &convMem = *convMem_; if (convMem.empty()) { int numDevices = hl_get_device_count(); convMem.resize(numDevices); } int devId = hl_get_device(); - MemoryHandle** localMem = &(convMem[devId]); + MemoryHandle **localMem = &(convMem[devId]); if (NULL == *localMem || size > (*localMem)->getAllocSize()) { *localMem = new GpuMemoryHandle(size); } diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h index d0bfe9a6edba05230202da065ca42741439ce190..779fe1455ade10ba55e32f4d9478d446b01b8a19 100644 --- a/paddle/gserver/layers/ConvProjection.h +++ b/paddle/gserver/layers/ConvProjection.h @@ -27,7 +27,8 @@ public: /** * Constructor. */ - ConvProjection(const ProjectionConfig& config, ParameterPtr parameter, + ConvProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu); ~ConvProjection(); @@ -47,9 +48,15 @@ protected: imageW_ = in_->getFrameWidth(); if (imageH_ == 0) imageH_ = configImgH_; if (imageW_ == 0) imageW_ = configImgW_; - outputH_ = outputSize(imageH_, filterH_, paddingH_, strideH_, + outputH_ = outputSize(imageH_, + filterH_, + paddingH_, + strideH_, /* caffeMode */ true); - outputW_ = outputSize(imageW_, filterW_, paddingW_, strideW_, + outputW_ = outputSize(imageW_, + filterW_, + paddingW_, + strideW_, /* caffeMode */ true); const_cast(out_)->setFrameHeight(outputH_); diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp index 6b3881e3cc80396bfa0b801ba296cb1118fabc74..6e77c1f14e6a6896f6ef7c4042954b25bd58266a 100644 --- a/paddle/gserver/layers/ConvShiftLayer.cpp +++ b/paddle/gserver/layers/ConvShiftLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp index a81cf939af671f3fb34fb52ae33035a7bb524aed..7e1fef8bc600329ac62002dab7b91238b83b8023 100644 --- a/paddle/gserver/layers/ConvexCombinationLayer.cpp +++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
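getSpaceBytes above keeps one lazily grown scratch buffer per device and reallocates only when a larger request arrives. The same pattern in a minimal CPU form; Buffer and getScratch are illustrative stand-ins for MemoryHandle and the member function:

#include <cstddef>
#include <vector>

// Grow-only per-device scratch: reuse the current allocation unless the
// request exceeds it, so repeated forward/backward calls stop allocating.
struct Buffer { std::vector<unsigned char> bytes; };

void* getScratch(std::vector<Buffer>& perDevice, int devId, size_t size) {
  if (static_cast<size_t>(devId) >= perDevice.size())
    perDevice.resize(devId + 1);
  Buffer& buf = perDevice[devId];
  if (buf.bytes.size() < size) buf.bytes.resize(size);  // grow only
  return buf.bytes.data();
}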
*/ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -70,12 +69,21 @@ bool ConvexCombinationLayer::init(const LayerMap& layerMap, CHECK_EQ(weightDim * dataDim, inputLayers_[1]->getSize()) << "Dimension mismatch"; - tmpRow0 = Matrix::create(nullptr, /* height= */ 1, weightDim, - /* trans= */ false, useGpu_); - tmpRow1 = Matrix::create(nullptr, /* height= */ 1, dataDim, - /* trans= */ false, useGpu_); - tmpMtx0 = Matrix::create(nullptr, /* height= */ weightDim, dataDim, - /* trans= */ false, useGpu_); + tmpRow0 = Matrix::create(nullptr, + /* height= */ 1, + weightDim, + /* trans= */ false, + useGpu_); + tmpRow1 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ weightDim, + dataDim, + /* trans= */ false, + useGpu_); return true; } diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp index 05a70aeff5e8ff3789bca966d351bffc8efb1cb3..894cb5b0d8226cc3b4b60bac38801bf0a7ec6b6a 100644 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ b/paddle/gserver/layers/CosSimLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CosSimLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -57,9 +56,12 @@ void CosSimLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str()); MatrixPtr outG = this->getOutputGrad(); - outG->cosSimDerivative(*this->getOutputValue(), *getInputValue(0), - *getInputValue(1), *getInputGrad(0), - *getInputGrad(1), config_.cos_scale()); + outG->cosSimDerivative(*this->getOutputValue(), + *getInputValue(0), + *getInputValue(1), + *getInputGrad(0), + *getInputGrad(1), + config_.cos_scale()); } } diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h index 65eb807ab2e6f16aab5ef2a9b08d697868c743a3..bc47998c11f267a1737ff82e8aa2958f6859bf86 100644 --- a/paddle/gserver/layers/CosSimLayer.h +++ b/paddle/gserver/layers/CosSimLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -35,8 +34,7 @@ namespace paddle { */ class CosSimLayer : public Layer { public: - explicit CosSimLayer(const LayerConfig& config) - : Layer(config) {} + explicit CosSimLayer(const LayerConfig& config) : Layer(config) {} ~CosSimLayer() {} diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp index 7d251ace6fdfde2506e4890b276db5b0d08d51f5..56d177da6458a590299fee5b24b8a9c935510916 100644 --- a/paddle/gserver/layers/CosSimVecMatLayer.cpp +++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -67,19 +66,37 @@ bool CosSimVecMatLayer::init(const LayerMap& layerMap, CHECK_EQ(dataDim * numKeys, memoryDim) << "Dimension mismatch"; - tmpRow0 = Matrix::create(nullptr, /* height= */ 1, dataDim, - /* trans= */ false, useGpu_); - tmpRow1 = Matrix::create(nullptr, /* height= */ 1, dataDim, - /* trans= */ false, useGpu_); - tmpRow2 = Matrix::create(nullptr, /* height= */ numKeys, 1, - /* trans= */ false, useGpu_); - tmpRow3 = Matrix::create(nullptr, /* height= */ numKeys, 1, - /* trans= */ false, useGpu_); - - tmpMtx0 = Matrix::create(nullptr, /* height= */ numKeys, dataDim, - /* trans= */ false, useGpu_); - tmpMtx1 = Matrix::create(nullptr, /* height= */ numKeys, dataDim, - /* trans= */ false, useGpu_); + tmpRow0 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpRow1 = Matrix::create(nullptr, + /* height= */ 1, + dataDim, + /* trans= */ false, + useGpu_); + tmpRow2 = Matrix::create(nullptr, + /* height= */ numKeys, + 1, + /* trans= */ false, + useGpu_); + tmpRow3 = Matrix::create(nullptr, + /* height= */ numKeys, + 1, + /* trans= */ false, + useGpu_); + + tmpMtx0 = Matrix::create(nullptr, + /* height= */ numKeys, + dataDim, + /* trans= */ false, + useGpu_); + tmpMtx1 = Matrix::create(nullptr, + /* height= */ numKeys, + dataDim, + /* trans= */ false, + useGpu_); return true; } @@ -131,8 +148,12 @@ void CosSimVecMatLayer::backward(const UpdateCallback& callback) { tmpRow2->setData(outV->rowBuf(i)); tmpRow3->setData(outG->rowBuf(i)); - tmpRow3->cosSimDerivative(*(tmpRow2), *(tmpMtx0), *(tmpRow0), *(tmpMtx1), - *(tmpRow1), config_.cos_scale()); + tmpRow3->cosSimDerivative(*(tmpRow2), + *(tmpMtx0), + *(tmpRow0), + *(tmpMtx1), + *(tmpRow1), + config_.cos_scale()); } } else { CHECK(!inG0 || !inG1) << "Not supported"; diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 3c2df52fed4f86675ce8f1ead6a3b66e4babde34..094c36ceb1f72ff9ee2cc9fa54de0b06312948fe 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
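For reference, the quantity whose gradient cosSimDerivative propagates in the layers above is the scaled cosine similarity. A minimal sketch of the forward value, with a small denominator guard added as an assumption of this sketch:

#include <cmath>
#include <vector>

// scale * (x . y) / (|x| |y|): the forward value whose derivative
// cosSimDerivative propagates above.
float cosSim(const std::vector<float>& x, const std::vector<float>& y,
             float scale) {
  float dot = 0.f, nx = 0.f, ny = 0.f;
  for (size_t i = 0; i < x.size(); ++i) {
    dot += x[i] * y[i];
    nx += x[i] * x[i];
    ny += y[i] * y[i];
  }
  return scale * dot / (std::sqrt(nx) * std::sqrt(ny) + 1e-12f);
}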
*/ - #include #include #include "paddle/utils/Logging.h" @@ -88,13 +87,15 @@ bool MultiClassCrossEntropy::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void MultiClassCrossEntropy::forwardImp(Matrix& output, Argument& label, +void MultiClassCrossEntropy::forwardImp(Matrix& output, + Argument& label, Matrix& target) { target.oneHotCrossEntropy(output, *label.ids); } -void MultiClassCrossEntropy::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void MultiClassCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { outputG.oneHotCrossEntropyBp(output, *label.ids); } @@ -152,17 +153,19 @@ bool SoftBinaryClassCrossEntropy::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, Argument& label, +void SoftBinaryClassCrossEntropy::forwardImp(Matrix& output, + Argument& label, Matrix& target) { - Matrix::resizeOrCreate(targetPerDim_, output.getHeight(), output.getWidth(), - false, useGpu_); + Matrix::resizeOrCreate( + targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); targetPerDim_->softCrossEntropy(output, *label.value); targetPerDim_->rowSum(target); } -void SoftBinaryClassCrossEntropy::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void SoftBinaryClassCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { outputG.softCrossEntropyBp(output, *label.value); } @@ -177,13 +180,15 @@ bool SumOfSquaresCostLayer::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void SumOfSquaresCostLayer::forwardImp(Matrix& output, Argument& label, +void SumOfSquaresCostLayer::forwardImp(Matrix& output, + Argument& label, Matrix& target) { target.sumOfSquares(output, *label.value); } -void SumOfSquaresCostLayer::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void SumOfSquaresCostLayer::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { outputG.sumOfSquaresBp(output, *label.value); } @@ -219,8 +224,8 @@ void RankingCost::forward(PassType passType) { IVectorPtr idLabel = getInput(*getLabelLayer()).ids; CHECK(idLabel) << "label layer has neither value nor ids"; CHECK_EQ((size_t)batchSize, idLabel->getSize()); - Matrix::resizeOrCreate(labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, - useGpu_); + Matrix::resizeOrCreate( + labelBuf_, batchSize, /*width*/ 1, /*trans*/ false, useGpu_); labelBuf_->copyFrom(*idLabel); label = labelBuf_; } @@ -261,8 +266,8 @@ void RankingCost::backward(const UpdateCallback& callback) { label = labelBuf_; } - Matrix::resizeOrCreate(marginGrad_, label->getHeight(), 1, /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + marginGrad_, label->getHeight(), 1, /* trans= */ false, useGpu_); marginGrad_->zeroMem(); marginGrad_->logisticRegressionLossBp(*margin_, *label); if (weightLayer_) { @@ -317,15 +322,14 @@ void LambdaCost::forward(PassType passType) { real* outputData = output->getData(); real* targetData = target->getData(); - auto startPos = - getInput(*getOutputLayer()).sequenceStartPositions; + auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; const int* startPosData = startPos->getData(false); size_t batchNum = startPos->getSize() - 1; for (size_t i = 0; i < batchNum; ++i) { int beginPos = startPosData[i]; int endPos = startPosData[i + 1]; - real NDCG = calcNDCG(outputData + beginPos, scoreData + beginPos, - endPos - beginPos); + real NDCG = calcNDCG( + outputData 
+ beginPos, scoreData + beginPos, endPos - beginPos); for (int j = beginPos; j < endPos; ++j) { targetData[j] = NDCG; } @@ -336,23 +340,27 @@ void LambdaCost::backward(const UpdateCallback& callback) { (void)callback; MatrixPtr score = getInputValue(*getScoreLayer()); MatrixPtr output = getInputValue(*getOutputLayer()); - Matrix::resizeOrCreate(marginGrad_, score->getHeight(), 1, - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(marginGrad_, + score->getHeight(), + 1, + /* trans= */ false, + useGpu_); marginGrad_->zeroMem(); real* gradData = marginGrad_->getData(); real* scoreData = score->getData(); real* outputData = output->getData(); - auto startPos = - getInput(*getOutputLayer()).sequenceStartPositions; + auto startPos = getInput(*getOutputLayer()).sequenceStartPositions; const int* startPosData = startPos->getData(false); size_t batchNum = startPos->getSize() - 1; for (size_t i = 0; i < batchNum; ++i) { int beginPos = startPosData[i]; int endPos = startPosData[i + 1]; - calcGrad(outputData + beginPos, scoreData + beginPos, gradData + beginPos, + calcGrad(outputData + beginPos, + scoreData + beginPos, + gradData + beginPos, endPos - beginPos); } @@ -361,8 +369,10 @@ void LambdaCost::backward(const UpdateCallback& callback) { void LambdaCost::onPassEnd() {} -void LambdaCost::calcGrad(const real* outputScore, const real* score, - real* gradData, int size) { +void LambdaCost::calcGrad(const real* outputScore, + const real* score, + real* gradData, + int size) { CHECK_GE(size, truncationSize_) << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; int sortSize = maxSortSize_ == -1 ? size : std::min(maxSortSize_, size); @@ -372,13 +382,16 @@ void LambdaCost::calcGrad(const real* outputScore, const real* score, scorePair_.push_back(std::make_pair(score[i], i)); } if (size <= sortSize) { - std::sort(scorePair_.begin(), scorePair_.end(), + std::sort(scorePair_.begin(), + scorePair_.end(), [](const std::pair<real, int>& a, const std::pair<real, int>& b) { return a.first > b.first; }); } else { std::partial_sort( - scorePair_.begin(), scorePair_.begin() + sortSize, scorePair_.end(), + scorePair_.begin(), + scorePair_.begin() + sortSize, + scorePair_.end(), [](const std::pair<real, int>& a, const std::pair<real, int>& b) { return a.first > b.first; }); @@ -414,7 +427,8 @@ void LambdaCost::calcGrad(const real* outputScore, const real* score, } } -real LambdaCost::calcNDCG(const real* outputScore, const real* score, +real LambdaCost::calcNDCG(const real* outputScore, + const real* score, int size) { CHECK_GE(size, truncationSize_) << "Invalid: (Sample num in the same list) < (NDCG truncation num) !"; @@ -424,7 +438,8 @@ real LambdaCost::calcNDCG(const real* outputScore, const real* score, outputScorePair_.push_back(std::make_pair(outputScore[i], i)); } std::partial_sort( - outputScorePair_.begin(), outputScorePair_.begin() + truncationSize_, + outputScorePair_.begin(), + outputScorePair_.begin() + truncationSize_, outputScorePair_.end(), [](const std::pair<real, int>& a, const std::pair<real, int>& b) { return a.first > b.first; @@ -439,8 +454,10 @@ real LambdaCost::calcNDCG(const real* outputScore, const real* score, scoreVec_.resize(size); std::copy(score, score + size, scoreVec_.begin()); real maxDCG = 0; - std::partial_sort(scoreVec_.begin(), scoreVec_.begin() + truncationSize_, - scoreVec_.end(), std::greater<real>()); + std::partial_sort(scoreVec_.begin(), + scoreVec_.begin() + truncationSize_, + scoreVec_.end(), + std::greater<real>()); for (int i = 0; i < truncationSize_; ++i) { maxDCG += (std::pow(2, scoreVec_[i]) - 1) / std::log(i + 2); } @@ -460,7 +477,8 @@
bool MultiBinaryLabelCrossEntropy::init(const LayerMap& layerMap, return CostLayer::init(layerMap, parameterMap); } -void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, Argument& label, +void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, + Argument& label, Matrix& target) { MatrixPtr value = nullptr; if (label.ids) { @@ -475,16 +493,17 @@ void MultiBinaryLabelCrossEntropy::forwardImp(Matrix& output, Argument& label, dynamic_cast<GpuSparseMatrix*>(value.get())) { target.multiBinaryLabelCrossEntropy(output, *value); } else { - Matrix::resizeOrCreate(targetPerDim_, output.getHeight(), output.getWidth(), - false, useGpu_); + Matrix::resizeOrCreate( + targetPerDim_, output.getHeight(), output.getWidth(), false, useGpu_); targetPerDim_->binaryLabelCrossEntropy(output, *value); targetPerDim_->rowSum(target); } } -void MultiBinaryLabelCrossEntropy::backwardImp( - Matrix& output, Argument& label, Matrix& outputG) { +void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { MatrixPtr value = nullptr; if (label.ids) { CHECK(!value); @@ -519,8 +538,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap, return true; } -void HuberTwoClass::forwardImp(Matrix &output, Argument &label, - Matrix &cost) { +void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) { if (useGpu_) { for (size_t i = 0; i < inputLayers_.size(); i++) { tmpCpuInput_[i].resizeAndCopyFrom( @@ -531,7 +549,8 @@ void HuberTwoClass::forwardImp(Matrix &output, Argument &label, forwardImpIn(output, label, cost); } -void HuberTwoClass::forwardImpIn(Matrix& output, Argument& label, +void HuberTwoClass::forwardImpIn(Matrix& output, + Argument& label, Matrix& target) { size_t numSamples = target.getHeight(); CHECK_EQ((*label.ids).getSize(), numSamples); @@ -539,7 +558,7 @@ void HuberTwoClass::forwardImpIn(Matrix& output, Argument& label, CHECK_EQ(output.getWidth(), (size_t)1); CHECK_EQ(target.getWidth(), (size_t)1); - real* out = useGpu_ ? tmpCpuInput_[0].value->getData(): output.getData(); + real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData(); int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
std::vector<real> cost(numSamples); for (size_t i = 0; i < numSamples; ++i) { @@ -554,19 +573,21 @@ void HuberTwoClass::forwardImpIn(Matrix& output, Argument& label, target.copyFrom(cost.data(), numSamples); } -void HuberTwoClass::backwardImp(Matrix &outputValue, - Argument &label, Matrix &outputGrad) { +void HuberTwoClass::backwardImp(Matrix& outputValue, + Argument& label, + Matrix& outputGrad) { if (useGpu_) { - backwardImpIn(*tmpCpuInput_[0].value, tmpCpuInput_[1], - *tmpCpuInput_[0].grad); + backwardImpIn( + *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad); outputGrad.copyFrom(*tmpCpuInput_[0].grad); } else { backwardImpIn(outputValue, label, outputGrad); } } -void HuberTwoClass::backwardImpIn( - Matrix& output, Argument& label, Matrix& outputG) { +void HuberTwoClass::backwardImpIn(Matrix& output, + Argument& label, + Matrix& outputG) { size_t numSamples = output.getHeight(); real* out = output.getData(); real* grad = outputG.getData(); @@ -605,7 +626,7 @@ public: int batchSize = input->getHeight(); int size = 1; resizeOutput(batchSize, size); - output_.value->sumRows(*input, /* scaleSum= */1, /* scaleDest= */0); + output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0); } virtual void backward(const UpdateCallback& callback = nullptr) { diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h index f263c688213ae6a83d5db4a1025aa252344dfab8..120ff9bd2d1b402e8ef2d074a84b76b0183dcab0 100644 --- a/paddle/gserver/layers/CostLayer.h +++ b/paddle/gserver/layers/CostLayer.h @@ -42,10 +42,12 @@ public: virtual void backward(const UpdateCallback& callback = nullptr); - virtual void forwardImp(Matrix& outputValue, Argument& label, + virtual void forwardImp(Matrix& outputValue, + Argument& label, Matrix& cost) = 0; - virtual void backwardImp(Matrix& outputValue, Argument& label, + virtual void backwardImp(Matrix& outputValue, + Argument& label, Matrix& outputGrad) = 0; protected: @@ -225,7 +227,9 @@ public: void onPassEnd(); real calcNDCG(const real* outputScore, const real* score, int size); - void calcGrad(const real* outputScore, const real* score, real* gradData, + void calcGrad(const real* outputScore, + const real* score, + real* gradData, int size); private: @@ -274,6 +278,7 @@ public: */ class HuberTwoClass : public CostLayer { std::vector<Argument> tmpCpuInput_; + public: explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {} diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp index 3c6d13b0bf92ea98eb5c3331a1fdff6b177529b6..6be62b1a25407a5340bb5cdd99745db5d33ec3da 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp +++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
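To make the LambdaCost arithmetic above easier to follow: calcNDCG ranks the top truncationSize_ = T items by model output and accumulates the truncated discounted cumulative gain \f[ DCG@T = \sum_{i=0}^{T-1} \frac{2^{s_{\pi(i)}} - 1}{\ln(i + 2)} \f] where the s values are the reference scores and \pi is the model-induced order; maxDCG is the same sum over the ideal, score-sorted order, and the returned value is their ratio (this reading is inferred from the hunks shown, not stated in the patch). Using the natural log in std::log(i + 2) is harmless because the log base cancels in the DCG/maxDCG ratio.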
*/ - #include "paddle/utils/Stat.h" #include "Layer.h" #include "CudnnBatchNormLayer.h" @@ -65,16 +64,31 @@ void CudnnBatchNormLayer::forward(PassType passType) { REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str()); real* savedMean = savedMean_->getData(); real* savedInvVar = savedInvVar_->getData(); - hl_batch_norm_forward_training(ioDesc_, input, ioDesc_, output, + hl_batch_norm_forward_training(ioDesc_, + input, + ioDesc_, + output, bnParamDesc_, - gamma, beta, 1.0 - movingAvgFraction_, - movingMean, movingVar, - EPS, savedMean, savedInvVar); + gamma, + beta, + 1.0 - movingAvgFraction_, + movingMean, + movingVar, + EPS, + savedMean, + savedInvVar); } else { // used movingMean and movingVar in testing - hl_batch_norm_forward_inference(ioDesc_, input, ioDesc_, output, - bnParamDesc_, gamma, beta, - movingMean, movingVar, EPS); + hl_batch_norm_forward_inference(ioDesc_, + input, + ioDesc_, + output, + bnParamDesc_, + gamma, + beta, + movingMean, + movingVar, + EPS); } /* activation */ { @@ -115,10 +129,19 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) { create(tmpBiasGrad_, 1, channels_, &betaGrad); } - hl_batch_norm_backward(ioDesc_, input, ioDesc_, outGrad, - ioDesc_, inGrad, bnParamDesc_, - gamma, gammaGrad, betaGrad, - EPS, savedMean, savedInvVar); + hl_batch_norm_backward(ioDesc_, + input, + ioDesc_, + outGrad, + ioDesc_, + inGrad, + bnParamDesc_, + gamma, + gammaGrad, + betaGrad, + EPS, + savedMean, + savedInvVar); { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h index 03f4f591c3bfa0139c6b10f180fbdeaa19a231b8..6220e77ceb5e248e5678c9170e85aff1cb40e1cd 100644 --- a/paddle/gserver/layers/CudnnBatchNormLayer.h +++ b/paddle/gserver/layers/CudnnBatchNormLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Stat.h" @@ -23,7 +22,8 @@ namespace paddle { /** * @brief Cudnn Batch normalization layer use to cuDNN lib to implentment. - * @note Cudnn version must >= v4.0, and better to use the latest version (v5.1). + * @note Cudnn version must >= v4.0, and better to use the latest version + * (v5.1). * * The config file api is batch_norm_layer. 
*/ diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp index 23ba2341185d1b86b90dee58939f8ca07fda9364..93c5565d2f401549959d6b067b05289592433a3a 100644 --- a/paddle/gserver/layers/CudnnConvLayer.cpp +++ b/paddle/gserver/layers/CudnnConvLayer.cpp @@ -32,16 +32,16 @@ bool CudnnConvLayer::init(const LayerMap &layerMap, numFilters_ = config_.num_filters(); CHECK(config_.shared_biases()); for (size_t i = 0; i < inputLayers_.size(); i++) { - ProjectionConfig* conf = new ProjectionConfig(); + ProjectionConfig *conf = new ProjectionConfig(); conf->set_type("conv"); conf->set_num_filters(numFilters_); - ConvConfig* convConf = conf->mutable_conv_conf(); + ConvConfig *convConf = conf->mutable_conv_conf(); *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); conf->set_input_size(getPrev(i)->getSize()); conf->set_output_size(getSize()); projConf_.emplace_back(conf); - projections_.emplace_back(Projection::create(*projConf_[i], - parameters_[i], useGpu_)); + projections_.emplace_back( + Projection::create(*projConf_[i], parameters_[i], useGpu_)); } if (biases_.get() && sharedBiases_) { @@ -67,15 +67,21 @@ void CudnnConvLayer::forward(PassType passType) { if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - hl_tensor_reshape(outputDesc_, batchSize, numFilters_ / groups_[0], - outputH_[0], outputW_[0], numFilters_ * outputH_[0] * outputW_[0], - outputH_[0] * outputW_[0], outputW_[0], 1); + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_ / groups_[0], + outputH_[0], + outputW_[0], + numFilters_ * outputH_[0] * outputW_[0], + outputH_[0] * outputW_[0], + outputW_[0], + 1); outputOffset_ = getOutputValue()->getWidth() / groups_[0]; for (int g = 0; g < groups_[0]; ++g) { real *biasData = biases_->getW()->getData() + biasOffset_ * g; real *outData = getOutputValue()->getData() + outputOffset_ * g; - hl_convolution_forward_add_bias(biasDesc_, biasData, - outputDesc_, outData); + hl_convolution_forward_add_bias( + biasDesc_, biasData, outputDesc_, outData); } } diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h index 6390d96315cc4422c65e52f0d219b903c66f2cbd..6cfbadfb53839d847b8b2bcf768da0f473ac05e5 100644 --- a/paddle/gserver/layers/CudnnConvLayer.h +++ b/paddle/gserver/layers/CudnnConvLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "ConvBaseLayer.h" diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp index 24adb50a985ff4020f1716a053aba325fcf076f3..21d8e2579f77c98da1e30a205952fa53e02fb853 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.cpp +++ b/paddle/gserver/layers/CudnnPoolLayer.cpp @@ -61,8 +61,13 @@ bool CudnnPoolLayer::init(const LayerMap &layerMap, strideHeight = strideY_; strideWidth = stride_; - hl_create_pooling_descriptor(&poolingDesc_, mode_, windowHeight, windowWidth, - heightPadding, widthPadding, strideHeight, + hl_create_pooling_descriptor(&poolingDesc_, + mode_, + windowHeight, + windowWidth, + heightPadding, + widthPadding, + strideHeight, strideWidth); return true; @@ -79,7 +84,10 @@ void CudnnPoolLayer::reshape(int batchSize) { } CHECK_EQ(inputLayers_[0]->getOutput().value->getWidth(), channels_ * imageH_ * imageW_); - outputH_ = outputSize(imageH_, sizeY_, confPaddingY_, strideY_, + outputH_ = outputSize(imageH_, + sizeY_, + confPaddingY_, + strideY_, /* caffeMode */ false); outputW_ = outputSize(imageW_, sizeX_, confPadding_, stride_, /* caffeMode */ false); @@ -113,8 +121,13 @@ void CudnnPoolLayer::backward(const UpdateCallback &callback) { real *inputGrad = getInputGrad(0)->getData(); real *outData = getOutputValue()->getData(); real *outGrad = getOutputGrad()->getData(); - hl_pooling_backward(inputDesc_, inputData, inputGrad, outputDesc_, outData, - outGrad, poolingDesc_); + hl_pooling_backward(inputDesc_, + inputData, + inputGrad, + outputDesc_, + outData, + outGrad, + poolingDesc_); } CudnnPoolLayer::~CudnnPoolLayer() { diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h index 2ef94720d2b9f13597cb0fb546726a2c2a67cb36..6a6b28db961553506bcf5db206a65e1e9d90fe94 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.h +++ b/paddle/gserver/layers/CudnnPoolLayer.h @@ -12,19 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "PoolLayer.h" namespace paddle { - /** - * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by - * cudnn api and only supports GPU. - * - * The config file api is img_pool_layer. - */ +/** + * @brief CudnnPoolLayer is subclass of PoolLayer, which is implemented by + * cudnn api and only supports GPU. + * + * The config file api is img_pool_layer. 
+ */ class CudnnPoolLayer : public PoolLayer { protected: diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp index 79b9181e694f008d99bda170c562a524212b2c73..9a4b2e9d3e256119f3ff24cfcb80d68c81f67c65 100644 --- a/paddle/gserver/layers/DataLayer.cpp +++ b/paddle/gserver/layers/DataLayer.cpp @@ -32,19 +32,20 @@ void DataLayer::copyDataToOutput(Argument& output) { data_.value->getWidth(), useGpu(output.deviceId)); } else { - output.value->resize(data_.value->getHeight(), - data_.value->getWidth()); + output.value->resize(data_.value->getHeight(), data_.value->getWidth()); } output.value->copyFrom(*data_.value); } if (data_.grad) { - Matrix::resizeOrCreate(output.grad, data_.grad->getHeight(), + Matrix::resizeOrCreate(output.grad, + data_.grad->getHeight(), data_.grad->getWidth(), - /* trans= */ false, useGpu(output.deviceId)); + /* trans= */ false, + useGpu(output.deviceId)); } if (data_.ids) { - IVector::resizeOrCreate(output.ids, data_.ids->getSize(), - useGpu(output.deviceId)); + IVector::resizeOrCreate( + output.ids, data_.ids->getSize(), useGpu(output.deviceId)); output.ids->copyFrom(*data_.ids); } } diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h index 3abec1b0653a812dcb0a8d5e0a24d8ead55c1d0b..da74702201bd3af3cd73ad51ef2579da97674bc6 100644 --- a/paddle/gserver/layers/DataLayer.h +++ b/paddle/gserver/layers/DataLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -20,7 +19,7 @@ limitations under the License. */ #include "Layer.h" namespace paddle { -/** +/** * This layer just copy data to output, and has no backward propagation. * * The config file api is data_layer. @@ -34,12 +33,10 @@ public: /** * Prefetch sparse matrix/ids only. */ - void prefetch() { - output_ = data_; - } + void prefetch() { output_ = data_; } - /** - * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims, + /** + * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims, * sequenceStartPositions, subSequenceStartPositions, strs) to output_. */ virtual void forward(PassType passType) { diff --git a/paddle/gserver/layers/DataNormLayer.cpp b/paddle/gserver/layers/DataNormLayer.cpp index 150977ce1a589cc7cc2b00a495314218ecaa772c..b398f3dbedc44eb422124a725aa745f684e821e3 100644 --- a/paddle/gserver/layers/DataNormLayer.cpp +++ b/paddle/gserver/layers/DataNormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "DataNormLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -37,16 +36,28 @@ bool DataNormLayer::init(const LayerMap& layerMap, << "The parameter of DataNormLayer must be static"; weight_ = std::unique_ptr(new Weight(5, getSize(), parameters_[0])); - min_ = Matrix::create(nullptr, /* height= */ 1, getSize(), /* trans= */ false, - useGpu_); - rangeReciprocal_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - mean_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - stdReciprocal_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - decimalReciprocal_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + min_ = Matrix::create( + nullptr, /* height= */ 1, getSize(), /* trans= */ false, useGpu_); + rangeReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + mean_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + stdReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + decimalReciprocal_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); min_->setData(weight_->getW()->getData()); rangeReciprocal_->setData(weight_->getW()->getData() + getSize()); diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h index 232c73f0346a12d59fa0dc316ef510be75e6b2b1..1179d94fbbd4032c9275f0586de5b526eb21c095 100644 --- a/paddle/gserver/layers/DataNormLayer.h +++ b/paddle/gserver/layers/DataNormLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/DotMulOperator.cpp b/paddle/gserver/layers/DotMulOperator.cpp index e6d2375b474d811ce8d485ca838428dc2860b608..9409493fdaaf0e84ab2e650e2c5e3db0c1fb1fbc 100644 --- a/paddle/gserver/layers/DotMulOperator.cpp +++ b/paddle/gserver/layers/DotMulOperator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Operator.h" namespace paddle { @@ -42,8 +41,8 @@ DotMulOperator::DotMulOperator(const OperatorConfig& config, bool useGpu) } void DotMulOperator::forward() { - out_->value->addDotMul(*ins_[0]->value, *ins_[1]->value, 1, - config_.dotmul_scale()); + out_->value->addDotMul( + *ins_[0]->value, *ins_[1]->value, 1, config_.dotmul_scale()); } void DotMulOperator::backward() { diff --git a/paddle/gserver/layers/DotMulProjection.cpp b/paddle/gserver/layers/DotMulProjection.cpp index f6f14c4429e2637ae722105c164a776758e1ca11..862eeb6f01db04451afb8a91ecb2c04e0f796952 100644 --- a/paddle/gserver/layers/DotMulProjection.cpp +++ b/paddle/gserver/layers/DotMulProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Projection.h" namespace paddle { @@ -29,7 +28,8 @@ namespace paddle { class DotMulProjection : public Projection { public: DotMulProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); @@ -41,7 +41,8 @@ protected: REGISTER_PROJECTION(dot_mul, DotMulProjection); DotMulProjection::DotMulProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu) + const ParameterPtr& parameter, + bool useGpu) : Projection(config, parameter, useGpu) { weight_.reset(new Weight(1LU, config.output_size(), parameter)); } diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp index 2d0778a451aae5997a4b39a7c106d96887a79a51..3a43705d263898bd407248b3d553185f7e40f798 100644 --- a/paddle/gserver/layers/EosIdCheckLayer.cpp +++ b/paddle/gserver/layers/EosIdCheckLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" @@ -20,7 +19,7 @@ namespace paddle { /** * A layer for checking EOS for each sample: * - output_id = (input_id == conf.eos_id) - * + * * The result is stored in output_.ids. * It is used by recurrent layer group. */ diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp index 0bab0ca764f4fea7dc37f0eae096de1a79c9df21..71a69bd0d01f4f6fcd579a408008ad4e00b5fd4d 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp +++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp @@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ExpandConvBaseLayer.h" #include "paddle/utils/Logging.h" namespace paddle { bool ExpandConvBaseLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { + const ParameterMap ¶meterMap) { /* Initialize the basic convolutional parent class */ ConvBaseLayer::init(layerMap, parameterMap); @@ -76,9 +75,11 @@ void ExpandConvBaseLayer::addSharedBias() { transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_, numFilters_); - MatrixPtr bias = - Matrix::create(biases_->getW()->getData(), 1, - biases_->getW()->getElementCnt(), false, useGpu_); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); transOutValue_->addBias(*bias, 1.0f); transOutValue_->reshape(mapW, mapH); @@ -90,32 +91,46 @@ void ExpandConvBaseLayer::addSharedBias() { void ExpandConvBaseLayer::addUnsharedBias() { MatrixPtr outValue = getOutputValue(); - MatrixPtr bias = - Matrix::create(biases_->getW()->getData(), 1, - biases_->getW()->getElementCnt(), false, useGpu_); + MatrixPtr bias = Matrix::create(biases_->getW()->getData(), + 1, + biases_->getW()->getElementCnt(), + false, + useGpu_); outValue->addBias(*bias, 1.0f); } - -void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, size_t startIdx, - int inIdx) { +void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image, + size_t startIdx, + int inIdx) { int channel = isDeconv_ ? 
numFilters_ : channels_[inIdx]; resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]); real *imgData = image->getData() + startIdx * image->getWidth(); - MatrixPtr imageTmp = Matrix::create( - imgData, 1, imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, false, - useGpu_); - expandInput_->convExpand(*imageTmp, imgSizeH_[inIdx], imgSizeW_[inIdx], - channel, filterSize_[inIdx], - filterSize_[inIdx], stride_[inIdx], stride_[inIdx], - padding_[inIdx], padding_[inIdx], - outputH_[inIdx], outputW_[inIdx]); + MatrixPtr imageTmp = + Matrix::create(imgData, + 1, + imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel, + false, + useGpu_); + expandInput_->convExpand(*imageTmp, + imgSizeH_[inIdx], + imgSizeW_[inIdx], + channel, + filterSize_[inIdx], + filterSize_[inIdx], + stride_[inIdx], + stride_[inIdx], + padding_[inIdx], + padding_[inIdx], + outputH_[inIdx], + outputW_[inIdx]); imageTmp->clear(); } -void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, MatrixPtr out, - int inIdx, int startIdx) { +void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, + MatrixPtr out, + int inIdx, + int startIdx) { int subM = subM_[inIdx]; int subN = subN_[inIdx]; int subK = subK_[inIdx]; @@ -124,8 +139,7 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, MatrixPtr out, int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_; - real *outData = - out->getData() + startIdx * subN * numFilters; + real *outData = out->getData() + startIdx * subN * numFilters; real *wgtData = weights_[inIdx]->getW()->getData(); real *expInData = expandInput_->getData(); @@ -145,7 +159,8 @@ void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image, MatrixPtr out, } } -void ExpandConvBaseLayer::bpropActs(MatrixPtr out, MatrixPtr image, +void ExpandConvBaseLayer::bpropActs(MatrixPtr out, + MatrixPtr image, int inpIdx) { int channel = isDeconv_ ? 
numFilters_ : channels_[inpIdx]; @@ -183,15 +198,26 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, MatrixPtr image, // shrink one frame outGrad MatrixPtr oneGradTmp = Matrix::create( expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_); - MatrixPtr vTmp = Matrix::create( - tgtGradData, 1, - imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, false, - useGpu_); - vTmp->convShrink(*oneGradTmp, imgSizeH_[inpIdx], imgSizeW_[inpIdx], - channel, filterSize_[inpIdx], - filterSize_[inpIdx], stride_[inpIdx], stride_[inpIdx], - padding_[inpIdx], padding_[inpIdx], - outputH_[inpIdx], outputW_[inpIdx], 1.0f, 1.0f); + MatrixPtr vTmp = + Matrix::create(tgtGradData, + 1, + imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel, + false, + useGpu_); + vTmp->convShrink(*oneGradTmp, + imgSizeH_[inpIdx], + imgSizeW_[inpIdx], + channel, + filterSize_[inpIdx], + filterSize_[inpIdx], + stride_[inpIdx], + stride_[inpIdx], + padding_[inpIdx], + padding_[inpIdx], + outputH_[inpIdx], + outputW_[inpIdx], + 1.0f, + 1.0f); vTmp->clear(); oneGradTmp->clear(); @@ -200,8 +226,9 @@ void ExpandConvBaseLayer::bpropActs(MatrixPtr out, MatrixPtr image, } } -void ExpandConvBaseLayer::bpropWeights(MatrixPtr image, MatrixPtr out, - int inpIdx) { +void ExpandConvBaseLayer::bpropWeights(MatrixPtr image, + MatrixPtr out, + int inpIdx) { MatrixPtr weightGrad = weights_[inpIdx]->getWGrad(); int subM = subM_[inpIdx]; @@ -249,9 +276,11 @@ void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) { } void ExpandConvBaseLayer::bpropBiases(MatrixPtr v) { - MatrixPtr biases = - Matrix::create(biases_->getWGrad()->getData(), 1, - biases_->getWGrad()->getElementCnt(), false, useGpu_); + MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(), + 1, + biases_->getWGrad()->getElementCnt(), + false, + useGpu_); if (sharedBiases_) { bpropSharedBias(biases, v); } else { diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h index 9858fa348c3fc85fdea0c017ca44fa047a6eaf42..5939d27e2a873308d710c1670a3aec843c3573ad 100644 --- a/paddle/gserver/layers/ExpandConvBaseLayer.h +++ b/paddle/gserver/layers/ExpandConvBaseLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "ConvBaseLayer.h" @@ -45,7 +44,7 @@ protected: public: explicit ExpandConvBaseLayer(const LayerConfig& config) - : ConvBaseLayer(config) {} + : ConvBaseLayer(config) {} ~ExpandConvBaseLayer() {} diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index 5ea1fdece5f7b83c7e1d576e7f02a4a2545f0cd8..0649289c1c671ae5952dd8db9d19f576da67409c 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
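expandOneFrame above is the im2col step: convExpand unrolls every input window into a column so that expandFwdOnce can run the convolution as an ordinary GEMM (filter weights times the expanded input), and bpropActs undoes it with the inverse convShrink. A minimal single-channel sketch of the idea, under simplified assumptions (square kernel, symmetric stride and padding); this is not Paddle's convExpand signature:

    // Minimal im2col: one row per kernel position, one column per output pixel.
    #include <vector>

    std::vector<float> im2col(const std::vector<float>& img, int h, int w,
                              int k, int stride, int pad) {
      int outH = (h + 2 * pad - k) / stride + 1;
      int outW = (w + 2 * pad - k) / stride + 1;
      std::vector<float> col(k * k * outH * outW, 0.0f);
      for (int ky = 0; ky < k; ++ky)
        for (int kx = 0; kx < k; ++kx)
          for (int oy = 0; oy < outH; ++oy)
            for (int ox = 0; ox < outW; ++ox) {
              int iy = oy * stride + ky - pad;  // source pixel, may be padding
              int ix = ox * stride + kx - pad;
              float v = (iy >= 0 && iy < h && ix >= 0 && ix < w)
                            ? img[iy * w + ix]
                            : 0.0f;
              col[(ky * k + kx) * outH * outW + oy * outW + ox] = v;
            }
      return col;
    }

With this layout the forward product is a (filters x k*k) by (k*k x outH*outW) GEMM, which corresponds to the subM/subK/subN dimensions used per group in expandFwdOnce above.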
*/ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "ExpandConvLayer.h" @@ -58,7 +57,6 @@ void ExpandConvLayer::forward(PassType passType) { forwardActivation(); } - void ExpandConvLayer::backward(const UpdateCallback &callback) { backwardActivation(); diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h index c07188a406183416cd57e2d027ba1205f6b65176..82a9e88a4208ea98a97bd56ef2f9f38de4f0031e 100644 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ b/paddle/gserver/layers/ExpandConvLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/math/Matrix.h" @@ -31,8 +30,8 @@ namespace paddle { class ExpandConvLayer : public ExpandConvBaseLayer { public: - explicit ExpandConvLayer(const LayerConfig& config) : - ExpandConvBaseLayer(config) {} + explicit ExpandConvLayer(const LayerConfig& config) + : ExpandConvBaseLayer(config) {} ~ExpandConvLayer() {} diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp index a3e160f1f4eb524d39ed90cb17f59f58c690f964..1132ab4f92000c96b22a295b360143d2f356ec5a 100644 --- a/paddle/gserver/layers/ExpandConvTransLayer.cpp +++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "ExpandConvTransLayer.h" @@ -27,7 +26,7 @@ namespace paddle { REGISTER_LAYER(exconvt, ExpandConvTransLayer); bool ExpandConvTransLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { + const ParameterMap ¶meterMap) { /* Initialize the basic convolutional parent class */ ExpandConvBaseLayer::init(layerMap, parameterMap); @@ -88,5 +87,4 @@ void ExpandConvTransLayer::backward(const UpdateCallback &callback) { } } - } // namespace paddle diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h index 87c464a97f2edd5c3528a4434a2aa741d10ddf2e..47efe3f65643fd17b86832fc240cda2e30d3fcc4 100644 --- a/paddle/gserver/layers/ExpandConvTransLayer.h +++ b/paddle/gserver/layers/ExpandConvTransLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/math/Matrix.h" @@ -30,8 +29,8 @@ namespace paddle { */ class ExpandConvTransLayer : public ExpandConvBaseLayer { public: - explicit ExpandConvTransLayer(const LayerConfig& config) : - ExpandConvBaseLayer(config) {} + explicit ExpandConvTransLayer(const LayerConfig& config) + : ExpandConvBaseLayer(config) {} ~ExpandConvTransLayer() {} diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp index d18b51dd7973737768b4fde37b67987abea9e2c6..97c8d143fe0d84c4e59e224962b53995ee50b844 100644 --- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp +++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Layer.h" #include "paddle/math/Matrix.h" #include "paddle/utils/Stat.h" @@ -79,9 +78,12 @@ void FeatureMapExpandLayer::forward(PassType passType) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr outVTmp = Matrix::create(outputV->getData() + i * imgSize * numFilters_, - numFilters_, imgSize, false, useGpu_); - MatrixPtr inVTmp = Matrix::create(inputV->getData() + i * imgSize, 1, - imgSize, false, useGpu_); + numFilters_, + imgSize, + false, + useGpu_); + MatrixPtr inVTmp = Matrix::create( + inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_); outVTmp->addRowVector(*inVTmp); } } @@ -101,9 +103,12 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) { for (size_t i = 0; i < batchSize; i++) { MatrixPtr outGradTmp = Matrix::create(outGrad->getData() + i * imgSize * numFilters_, - numFilters_, imgSize, false, useGpu_); - MatrixPtr inGradTmp = Matrix::create(inGrad->getData() + i * imgSize, 1, - imgSize, false, useGpu_); + numFilters_, + imgSize, + false, + useGpu_); + MatrixPtr inGradTmp = Matrix::create( + inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_); inGradTmp->collectBias(*outGradTmp, 1); } } diff --git a/paddle/gserver/layers/FullMatrixProjection.cpp b/paddle/gserver/layers/FullMatrixProjection.cpp index f17c1b05bd892c7d933e4910887f977ac5cda79b..35a5cb5b7a450e7233b6dddbef58a2acccfb1608 100644 --- a/paddle/gserver/layers/FullMatrixProjection.cpp +++ b/paddle/gserver/layers/FullMatrixProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "FullMatrixProjection.h" namespace paddle { diff --git a/paddle/gserver/layers/FullMatrixProjection.h b/paddle/gserver/layers/FullMatrixProjection.h index e99444b33b82e4694ee6df4df5f5447bdc3baaa0..ddb1e7b18c4f967383feb922ce89d13a452109b2 100644 --- a/paddle/gserver/layers/FullMatrixProjection.h +++ b/paddle/gserver/layers/FullMatrixProjection.h @@ -30,7 +30,8 @@ namespace paddle { class FullMatrixProjection : public Projection { public: FullMatrixProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp index c754f8fd9480de73067b295ffacbbaab1866568a..70c56499a7738c12db40bfd0ca5fec399d72f99b 100644 --- a/paddle/gserver/layers/FullyConnectedLayer.cpp +++ b/paddle/gserver/layers/FullyConnectedLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "FullyConnectedLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h index 334eb4b722f4ff9a794a3818a1cf3087da27692f..e15e1236cdb75d1c41bbb993f86545334785909a 100644 --- a/paddle/gserver/layers/FullyConnectedLayer.h +++ b/paddle/gserver/layers/FullyConnectedLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -20,9 +19,9 @@ limitations under the License. 
*/ #include "paddle/utils/ThreadLocal.h" namespace paddle { -/** +/** * A layer has full connections to all neurons in the previous layer. - * It computes an inner product with a set of learned weights, and + * It computes an inner product with a set of learned weights, and * (optionally) adds biases. * * The config file api is fc_layer. @@ -34,8 +33,7 @@ protected: std::unique_ptr biases_; public: - explicit FullyConnectedLayer(const LayerConfig& config) - : Layer(config) {} + explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {} ~FullyConnectedLayer() {} bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp index e0c6ff7ea28418d7bfb2db0b20281165f328976d..495c2174f3e9afbee676622d53248c7f5aeea404 100644 --- a/paddle/gserver/layers/GatedRecurrentLayer.cpp +++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Layer.h" #include "GatedRecurrentLayer.h" #include "paddle/utils/Stat.h" @@ -30,8 +29,8 @@ bool GatedRecurrentLayer::init(const LayerMap& layerMap, CHECK_EQ(getSize() * 3, biasParameter_->getSize()); weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); gateWeight_.reset(new Weight(getSize(), getSize() * 2, parameters_[0], 0)); - stateWeight_.reset(new Weight(getSize(), getSize(), parameters_[0], - 2 * getSize() * getSize())); + stateWeight_.reset(new Weight( + getSize(), getSize(), parameters_[0], 2 * getSize() * getSize())); if (biasParameter_.get() != NULL) { bias_.reset(new Weight(1, getSize() * 3, biasParameter_)); } @@ -48,8 +47,8 @@ bool GatedRecurrentLayer::init(const LayerMap& layerMap, void GatedRecurrentLayer::resetState() { CHECK(!reversed_) << "state is not allowed for reversed gated " "recurrent layer"; - Matrix::resizeOrCreate(prevOutput_, 1, getSize(), /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); prevOutput_->zeroMem(); // TODO(hedaoyuan): support prev_batch_state @@ -85,10 +84,16 @@ void GatedRecurrentLayer::forward(PassType passType) { // batchSize = length of total frames in a batch (NOT size of mini-batch) CHECK_EQ(starts[numSequences], batchSize); - Matrix::resizeOrCreate(gate_.value, /* height= */batchSize, - getSize() * 3, /* trans= */false, useGpu_); - Matrix::resizeOrCreate(resetOutput_.value, /* height= */batchSize, - getSize(), /* trans= */false, useGpu_); + Matrix::resizeOrCreate(gate_.value, + /* height= */ batchSize, + getSize() * 3, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(resetOutput_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); if (useBatch_) { forwardBatch(batchSize, numSequences, starts, input.value); @@ -105,10 +110,16 @@ void GatedRecurrentLayer::backward(const UpdateCallback& callback) { const int* starts = input.sequenceStartPositions->getData(false); size_t numSequences = input.getNumSequences(); - Matrix::resizeOrCreate(gate_.grad, /* height= */batchSize, - getSize() * 3, /* trans= */false, useGpu_); - Matrix::resizeOrCreate(resetOutput_.grad, /* height= */batchSize, - getSize(), /* trans= */false, useGpu_); + Matrix::resizeOrCreate(gate_.grad, + /* height= */ batchSize, + getSize() * 3, + /* trans= */ false, + useGpu_); + Matrix::resizeOrCreate(resetOutput_.grad, + 
/* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); if (useBatch_) { backwardBatch(batchSize, input.grad); @@ -125,7 +136,7 @@ void GatedRecurrentLayer::backward(const UpdateCallback& callback) { void GatedRecurrentLayer::forwardSequence(int batchSize, size_t numSequences, - const int *starts, + const int* starts, MatrixPtr inputValue) { REGISTER_TIMER_INFO("GruFwSequenceTime", getName().c_str()); gate_.value->assign(*inputValue); @@ -198,7 +209,7 @@ void GatedRecurrentLayer::forwardSequence(int batchSize, void GatedRecurrentLayer::backwardSequence(int batchSize, size_t numSequences, - const int *starts, + const int* starts, MatrixPtr inputGrad) { REGISTER_TIMER_INFO("GruBwSequenceTime", getName().c_str()); @@ -211,9 +222,10 @@ void GatedRecurrentLayer::backwardSequence(int batchSize, hl_gru_grad gruGrad; gruGrad.gateWeightGrad = - (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); + (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = - (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr); + (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() + : nullptr); gruGrad.gateGrad = gate_.grad->getData(); gruGrad.resetOutputGrad = resetOutput_.grad->getData(); gruGrad.outputGrad = output_.grad->getData(); @@ -298,11 +310,10 @@ void GatedRecurrentLayer::forwardBatch(int batchSize, if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); } - batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, - reversed_); + batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_); batchValue_->resizeOrCreate(*output_.value); - batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */true); + batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); if (bias_ && bias_->getWGrad()) { gate_.value->addBias(*(bias_->getW()), 1); } @@ -315,14 +326,14 @@ void GatedRecurrentLayer::forwardBatch(int batchSize, MatrixPtr outputValueTmp = batchValue_->getBatchValue(n); gruValue.outputValue = outputValueTmp->getData(); gruValue.gateValue = - (batchValue_->getBatchValue(*gate_.value, n))->getData(); + (batchValue_->getBatchValue(*gate_.value, n))->getData(); gruValue.resetOutputValue = - (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); + (batchValue_->getBatchValue(*resetOutput_.value, n))->getData(); batchSize = outputValueTmp->getHeight(); gruValue.prevOutValue = - (n == 0 ? nullptr - : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); + (n == 0 ? nullptr + : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); { if (useGpu_) { @@ -333,13 +344,10 @@ void GatedRecurrentLayer::forwardBatch(int batchSize, } } } - { - batchValue_->copyBackSeq(*output_.value); - } + { batchValue_->copyBackSeq(*output_.value); } } -void GatedRecurrentLayer::backwardBatch(int batchSize, - MatrixPtr inputGrad) { +void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) { REGISTER_TIMER_INFO("GruBwBatchTime", getName().c_str()); hl_gru_value gruValue; gruValue.gateWeight = (gateWeight_->getW())->getData(); @@ -347,18 +355,17 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, hl_gru_grad gruGrad; gruGrad.gateWeightGrad = - (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); + (gateWeight_->getWGrad() ? gateWeight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = - (stateWeight_->getWGrad() ? stateWeight_->getWGrad()->getData() : nullptr); + (stateWeight_->getWGrad() ? 
stateWeight_->getWGrad()->getData() + : nullptr); if (!batchGrad_) { batchGrad_.reset(new SequenceToBatch(useGpu_)); } batchGrad_->shareIndexWith(*batchValue_); - { - batchGrad_->copyFromSeq(*output_.grad); - } + { batchGrad_->copyFromSeq(*output_.grad); } { int numBatch = batchGrad_->getNumBatch(); @@ -366,39 +373,36 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, AsyncGpuBlock asyncGpuBlock; for (int n = (int)numBatch - 1; n >= 0; n--) { gruValue.gateValue = - (batchGrad_->getBatchValue(*gate_.value, n))->getData(); + (batchGrad_->getBatchValue(*gate_.value, n))->getData(); gruValue.resetOutputValue = - (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); + (batchGrad_->getBatchValue(*resetOutput_.value, n))->getData(); - MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); + MatrixPtr outputGradTmp = batchGrad_->getBatchValue(n); gruGrad.outputGrad = outputGradTmp->getData(); - gruGrad.gateGrad = - (batchGrad_->getBatchValue(*gate_.grad , n))->getData(); + gruGrad.gateGrad = (batchGrad_->getBatchValue(*gate_.grad, n))->getData(); gruGrad.resetOutputGrad = - (batchGrad_->getBatchValue(*resetOutput_.grad , n))->getData(); + (batchGrad_->getBatchValue(*resetOutput_.grad, n))->getData(); { batchSize = outputGradTmp->getHeight(); gruValue.prevOutValue = - (n == 0 ? nullptr - : (batchValue_->getBatchValue(n - 1, batchSize))->getData()); + (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize)) + ->getData()); gruGrad.prevOutGrad = - (n == 0 ? nullptr - : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); + (n == 0 ? nullptr + : (batchGrad_->getBatchValue(n - 1, batchSize))->getData()); if (useGpu_) { - GruCompute::backward<1>(gruValue, gruGrad, getSize(), - batchSize); + GruCompute::backward<1>(gruValue, gruGrad, getSize(), batchSize); } else { - GruCompute::backward<0>(gruValue, gruGrad, getSize(), - batchSize); + GruCompute::backward<0>(gruValue, gruGrad, getSize(), batchSize); } } } } if (inputGrad) { - batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */false); + batchGrad_->add(*inputGrad, *gate_.grad, /* seq2batch */ false); } if (bias_ && bias_->getWGrad()) { bias_->getWGrad()->collectBias(*gate_.grad, /* scale */ 1); diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h index 19f71206bc00a15892815cc1e0c039659b841df6..3b8706a44e21e5a780c6423b65369dc5b695b59b 100644 --- a/paddle/gserver/layers/GatedRecurrentLayer.h +++ b/paddle/gserver/layers/GatedRecurrentLayer.h @@ -63,13 +63,19 @@ public: LayerStatePtr getState(); protected: - void forwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); - void backwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); - - void forwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue); + void forwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); + void backwardSequence(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputGrad); + + void forwardBatch(int batchSize, + size_t numSequences, + const int* starts, + MatrixPtr inputValue); void backwardBatch(int batchSize, MatrixPtr inputGrad); protected: diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp index f036cd2b5284222bbcbcdfda7b7a0142eba750a7..01579d55fd9d0918b62ae0ddd9a7e90b4a697a13 100644 --- a/paddle/gserver/layers/GetOutputLayer.cpp +++ b/paddle/gserver/layers/GetOutputLayer.cpp 
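A note on the batch path above: the SequenceToBatch machinery regroups frames so that step n of every still-active sequence forms one contiguous matrix, turning each recurrent step into a single large GEMM instead of one small GEMM per sequence; prevOutValue at step n is then the step n-1 batch, and nullptr at step 0, as the ternaries above show. A toy illustration of the per-step batch sizes, assuming three sequences of lengths 4, 2, and 3 (illustrative only, not the SequenceToBatch API):

    // Count how many rows the batch matrix has at each recurrent step.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> seqLens = {4, 2, 3};
      int maxLen = *std::max_element(seqLens.begin(), seqLens.end());
      for (int step = 0; step < maxLen; ++step) {
        int rows = 0;
        for (int len : seqLens) rows += (step < len) ? 1 : 0;
        std::printf("step %d: batch of %d frames\n", step, rows);  // 3, 3, 2, 1
      }
      return 0;
    }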
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Layer.h" namespace paddle { diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp index c942122633c3d9e6dd89ce57c35d50db819ba3a1..d9d423af448fd267b777ef57964dced3b7a09f63 100644 --- a/paddle/gserver/layers/GruCompute.cpp +++ b/paddle/gserver/layers/GruCompute.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "GruCompute.h" #include "hl_recurrent_apply.cuh" @@ -20,14 +19,12 @@ limitations under the License. */ namespace paddle { void GruCompute::init(LayerConfig &config) { - activeNode_ = hlActiveType(config.active_type()); - activeGate_ = hlActiveType(config.active_gate_type()); + activeNode_ = hlActiveType(config.active_type()); + activeGate_ = hlActiveType(config.active_gate_type()); } template <> -void GruCompute::forward<0>(hl_gru_value value, - int frameSize, - int batchSize) { +void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) { hl_cpu_gru_forward(hppl::forward::gru_resetOutput(), hppl::forward::gru_finalOutput(), value, @@ -39,17 +36,17 @@ void GruCompute::forward<0>(hl_gru_value value, template <> void GruCompute::backward<0>(hl_gru_value value, - hl_gru_grad grad, - int frameSize, - int batchSize) { -hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), - hppl::backward::gru_resetGrad(), - value, - grad, - frameSize, - batchSize, - activeNode_, - activeGate_); + hl_gru_grad grad, + int frameSize, + int batchSize) { + hl_cpu_gru_backward(hppl::backward::gru_stateGrad(), + hppl::backward::gru_resetGrad(), + value, + grad, + frameSize, + batchSize, + activeNode_, + activeGate_); } } // namespace paddle diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h index 3a1b69b940d089d8f346756d312e0eb21d445e05..58b5aacba0403f8d10e34b055f5a69ad5ffa4837 100644 --- a/paddle/gserver/layers/GruCompute.h +++ b/paddle/gserver/layers/GruCompute.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/TypeDefs.h" @@ -29,7 +28,9 @@ public: void forward(hl_gru_value value, int frameSize, int batchSize = 1); template <bool useGpu> - void backward(hl_gru_value value, hl_gru_grad grad, int frameSize, + void backward(hl_gru_value value, + hl_gru_grad grad, + int frameSize, int batchSize = 1); public: diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp index 501229d10ab87af0baa8b5d3f94a218f2d064d61..6c9b0c5771bec765d043cd654fbb30ba56f8c813 100644 --- a/paddle/gserver/layers/GruStepLayer.cpp +++ b/paddle/gserver/layers/GruStepLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "Layer.h" #include "GruCompute.h" #include "paddle/utils/Stat.h" @@ -32,7 +31,8 @@ namespace paddle { * \f[ * update \ gate: z_t = actGate(xz_t + U_z * prev_out + bias_z) \\ * reset \ gate: r_t = actGate(xr_t + U_r * prev_out + bias_r) \\ - * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o) \\ + * output \ candidate: {h}_t = actNode(xi_t + U * dot(r_t, prev_out) + bias_o) + * \\ * output: h_t = dot((1-z_t), prev_out) + dot(z_t, {h}_t) * \f] * @@ -91,10 +91,16 @@ void GruStepLayer::forward(PassType passType) { int batchSize = input.getBatchSize(); resetOutput(batchSize, getSize()); - resetSpecifyOutput(gate_, batchSize, getSize() * 3, - /* isValueClean */ false, /* isGradClean */ false); - resetSpecifyOutput(resetOutput_, batchSize, getSize(), - /* isValueClean */ false, /* isGradClean */ false); + resetSpecifyOutput(gate_, + batchSize, + getSize() * 3, + /* isValueClean */ false, + /* isGradClean */ false); + resetSpecifyOutput(resetOutput_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ false); gate_.value->assign(*input.value); if (bias_) { gate_.value->addBias(*(bias_->getW()), 1); } @@ -103,7 +109,7 @@ void GruStepLayer::forward(PassType passType) { hl_gru_value gruValue; gruValue.gateWeight = weight_->getW()->getData(); gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; - gruValue.gateValue = gate_.value->getData();; + gruValue.gateValue = gate_.value->getData(); gruValue.resetOutputValue = resetOutput_.value->getData(); gruValue.outputValue = output_.value->getData(); gruValue.prevOutValue = prevOutput.value->getData(); @@ -125,17 +131,18 @@ void GruStepLayer::backward(const UpdateCallback& callback) { hl_gru_value gruValue; gruValue.gateWeight = weight_->getW()->getData(); gruValue.stateWeight = weight_->getW()->getData() + getSize() * getSize() * 2; - gruValue.gateValue = gate_.value->getData();; + gruValue.gateValue = gate_.value->getData(); gruValue.resetOutputValue = resetOutput_.value->getData(); gruValue.outputValue = output_.value->getData(); gruValue.prevOutValue = prevOutput.value->getData(); - hl_gru_grad gruGrad; + hl_gru_grad gruGrad; gruGrad.gateWeightGrad = - (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr); + (weight_->getWGrad() ? weight_->getWGrad()->getData() : nullptr); gruGrad.stateWeightGrad = - (weight_->getWGrad() ? - weight_->getWGrad()->getData() + getSize() * getSize() * 2 : nullptr); + (weight_->getWGrad() + ? weight_->getWGrad()->getData() + getSize() * getSize() * 2 + : nullptr); gruGrad.gateGrad = gate_.grad->getData(); gruGrad.resetOutputGrad = resetOutput_.grad->getData(); diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index 7091c6aa222e52e09603d84f52f88de11b9a7d73..61bc77778501fb9421cd2a72459d35ac9f47a5cb 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ - #include "HierarchicalSigmoidLayer.h" #include "paddle/utils/Util.h" @@ -61,10 +60,16 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { int batchSize = getInputValue(0)->getHeight(); int size = getSize(); reserveOutput(batchSize, size); - Matrix::resizeOrCreate(preOutput_.value, batchSize, codeLength_, - /* trans */ false, useGpu(deviceId_)); - Matrix::resizeOrCreate(preOutput_.grad, batchSize, codeLength_, - /* trans */ false, useGpu(deviceId_)); + Matrix::resizeOrCreate(preOutput_.value, + batchSize, + codeLength_, + /* trans */ false, + useGpu(deviceId_)); + Matrix::resizeOrCreate(preOutput_.grad, + batchSize, + codeLength_, + /* trans */ false, + useGpu(deviceId_)); IVectorPtr label = getInput(*getLabelLayer()).ids; @@ -76,16 +81,18 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { } for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { MatrixPtr input = getInputValue(i); - preOutput_.value->mulByBitCode(numClasses_, *label, *weights_[i]->getW(), - *input); + preOutput_.value->mulByBitCode( + numClasses_, *label, *weights_[i]->getW(), *input); } // keep consistent with the clipping in the following softrelu preOutput_.value->clip(-40.0, 40.0); - preOutput_.value->sumByBitCode(numClasses_, *label, *output_.value, + preOutput_.value->sumByBitCode(numClasses_, + *label, + *output_.value, -1); // scaleSum preOutput_.value->softrelu(*preOutput_.value); - MatrixPtr sum = Matrix::create(batchSize, - 1, /* trans= */ false, useGpu(deviceId_)); + MatrixPtr sum = + Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_)); preOutput_.value->rowSum(*sum); output_.value->add(*sum); } @@ -97,8 +104,8 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { preOutput_.grad->subByBitCode(numClasses_, *label); if (biases_ && biases_->getWGrad()) { - preOutput_.grad->addByBitCodeBackward(numClasses_, *label, - *biases_->getWGrad()); + preOutput_.grad->addByBitCodeBackward( + numClasses_, *label, *biases_->getWGrad()); /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 1942c5fe1e4f4da1d3d9197a3ffd80e3e55ec2ac..10762bc92687a3ea8debb7b9aa26a0cf0f94421c 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -20,15 +19,15 @@ limitations under the License. */ namespace paddle { /** - * Organize the classes into a binary tree. At each node, a sigmoid function + * Organize the classes into a binary tree. At each node, a sigmoid function * is used to calculate the probability of belonging to the right branch. - * This idea is from "F. Morin, Y. Bengio (AISTATS 05): + * This idea is from "F. Morin, Y. Bengio (AISTATS 05): * Hierarchical Probabilistic Neural Network Language Model." * * Here we uses a simple way of making the binary tree. * Assuming the number of classes C = 6, * The classes are organized as a binary tree in the following way: - * + * * @code{.py} * *-*-*- 2 * | | |- 3 @@ -44,15 +43,15 @@ namespace paddle { * - Node 0 ... C-2 are internal nodes. * - Node C-1 ... 2C-2 are leaf nodes. * - Class c is represented by leaf node \f$c+C-1\f$. 
- * + * * We assign an id for each node: * - the id of the root is 0. * - the left child of a node i is 2*i+1. * - the right child of a node i is 2*i+2. * * It's easy to see that: - * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$. - * - the j-th level ancestor of node i is + * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$. + * - the j-th level ancestor of node i is * \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$. * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$. * @@ -69,7 +68,7 @@ public: protected: /** * The last of the inputs is the label layer. - */ + */ LayerPtr getLabelLayer() { return inputLayers_.back(); } WeightList weights_; diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp index 6b7d20cc507e453e49708c2418f6d67abf3326f8..b38656c960f17b2c2c315eba70c61c328ed3e49a 100644 --- a/paddle/gserver/layers/IdentityProjection.cpp +++ b/paddle/gserver/layers/IdentityProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "Projection.h" @@ -29,7 +28,8 @@ namespace paddle { class IdentityProjection : public Projection { public: IdentityProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); }; @@ -70,7 +70,8 @@ void IdentityProjection::backward(const UpdateCallback& callback) { class IdentityOffsetProjection : public Projection { public: IdentityOffsetProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu); + const ParameterPtr& parameter, + bool useGpu); virtual void forward(); virtual void backward(const UpdateCallback& callback); }; diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp index 4102df840a48412a9c4ceb476488febf43a8e80c..b00bee235693d56aecfdc676647e102fe8d0ebfc 100644 --- a/paddle/gserver/layers/InterpolationLayer.cpp +++ b/paddle/gserver/layers/InterpolationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -26,8 +25,8 @@ namespace paddle { * \f[ * y.row[i] = w[i] * x_1.row[i] + (1 - w[i]) * x_2.row[i] * \f] - * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, - * \f$w\f$ is (batchSize x 1) weight vector, + * where \f$x_1\f$ and \f$x_2\f$ are two (batchSize x dataDim) inputs, + * \f$w\f$ is a (batchSize x 1) weight vector, * and \f$y\f$ is (batchSize x dataDim) output. * * The config file api is interpolation_layer. diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 78d15c553021de6bbda210cb782c8a240cc2bf73..0f9e7c0ff89531edeb5e7c5b2bc03f28b0a08b94 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
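The numbering scheme just stated (root 0, children 2*i+1 and 2*i+2, class c at leaf c+C-1) fully determines the code path each example touches. A small sketch, using a hypothetical helper that is not part of the layer, which walks from a class's leaf back to the root and prints the (parent node, branch) pairs that the *ByBitCode kernels effectively iterate over:

```cpp
#include <cstdio>

// Walk from the leaf of class c to the root using the numbering above:
// leaf id = c + C - 1, parent(i) = (i-1)/2, i is a left child iff
// (i-1) % 2 == 0. Each step yields one internal node whose sigmoid
// decides the branch taken toward class c.
void printCode(int c, int numClasses) {
  for (int node = c + numClasses - 1; node > 0; node = (node - 1) / 2) {
    std::printf("node %d, %s child\n", (node - 1) / 2,
                (node - 1) % 2 == 0 ? "left" : "right");
  }
}

int main() { printCode(/*c=*/3, /*numClasses=*/6); }
// leaf 8 -> node 3 (right) -> node 1 (left) -> node 0 (left)
```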
*/ - #include "paddle/utils/Util.h" #include "paddle/utils/Logging.h" @@ -123,19 +122,22 @@ LayerPtr Layer::create(const LayerConfig& config) { return LayerPtr(registrar_.createByType(config.type(), config)); } -void Layer::resetSpecifyOutput(Argument& output, size_t height, size_t width, - bool isValueClean, bool isGradClean) { +void Layer::resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean) { SetDevice device(output.deviceId); - Matrix::resizeOrCreate(output.value, height, width, /* trans */ false, - useGpu(output.deviceId)); + Matrix::resizeOrCreate( + output.value, height, width, /* trans */ false, useGpu(output.deviceId)); if (isValueClean) { output.value->zeroMem(); } if (passType_ != PASS_TEST && needGradient()) { - Matrix::resizeOrCreate(output.grad, height, width, /* trans */ false, - useGpu(output.deviceId)); + Matrix::resizeOrCreate( + output.grad, height, width, /* trans */ false, useGpu(output.deviceId)); if (isGradClean) { output.grad->zeroMem(); } @@ -227,8 +229,10 @@ void Layer::waitAndMergeOutputGrad() { if (outputOtherDevice_.size() == 1) return; } - Matrix::resizeOrCreate(tmpGrad_, output_.grad->getHeight(), - output_.grad->getWidth(), /* trans */ false, + Matrix::resizeOrCreate(tmpGrad_, + output_.grad->getHeight(), + output_.grad->getWidth(), + /* trans */ false, useGpu(output_.deviceId)); for (; i != outputOtherDevice_.size(); i++) { @@ -258,8 +262,8 @@ void Layer::zeroGrad() { } void Layer::initNeedFlags() { - auto initFlag = [this](bool& flag, bool (Layer::*flagQueryFunc)() const, - ParameterType type) { + auto initFlag = [this]( + bool& flag, bool (Layer::*flagQueryFunc)() const, ParameterType type) { flag = false; if (biasParameter_ && biasParameter_->hasType(type)) { flag = true; @@ -293,10 +297,12 @@ void Layer::showOutputStats() { } MatrixPtr outSquare; if (dynamic_cast(out.get())) { - GpuSparseMatrix *tmp = dynamic_cast(out.get()); - outSquare = std::make_shared( - tmp->getHeight(), tmp->getWidth(), tmp->getElementCnt(), - tmp->getValueType(), tmp->getFormat()); + GpuSparseMatrix* tmp = dynamic_cast(out.get()); + outSquare = std::make_shared(tmp->getHeight(), + tmp->getWidth(), + tmp->getElementCnt(), + tmp->getValueType(), + tmp->getFormat()); } else { outSquare = out->clone(); } @@ -321,8 +327,7 @@ void Layer::showOutputStats() { std = std > 0 ? 
std : 0; LOG(INFO) << "The output state of " << config_.name() << ": mean=" << mean << ", " - << "std=" << std - << ", " + << "std=" << std << ", " << "min=" << min << ", " << "max=" << max; } @@ -348,8 +353,8 @@ void Layer::backwardActivation() { if (config_.error_clipping_threshold() > 0.0f) { if (FLAGS_log_error_clipping) { CpuVector outGradVec(0, nullptr); - outGradVec.subVecFrom(output_.grad->getData(), 0, - output_.grad->getElementCnt()); + outGradVec.subVecFrom( + output_.grad->getData(), 0, output_.grad->getElementCnt()); real maxAbsGrad = outGradVec.getAbsMax(); if (maxAbsGrad > config_.error_clipping_threshold()) { real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize(); @@ -376,16 +381,19 @@ void Layer::forwardDropOut() { if (passType_ == PASS_TRAIN || passType_ == PASS_METRIC_TRAIN || passType_ == PASS_METRIC_TRAIN_WITH_NOERROR) { // new dropOutMask_ if dropOutMask_ is null ptr - Matrix::resizeOrCreate(dropOutMask_, outV->getHeight(), outV->getWidth(), - false, useGpu(deviceId_)); + Matrix::resizeOrCreate(dropOutMask_, + outV->getHeight(), + outV->getWidth(), + false, + useGpu(deviceId_)); dropOutMask_->randomizeUniform(); // generate a uniform random matrix dropOutMask_->biggerThanScalar(config_.drop_rate()); // random mask outV->dotMul(*outV, *dropOutMask_); // dropout } else if (passType_ == PASS_GC) { // only initialize once if (!dropOutMask_) { - dropOutMask_ = Matrix::create(outV->getHeight(), outV->getWidth(), false, - useGpu(deviceId_)); + dropOutMask_ = Matrix::create( + outV->getHeight(), outV->getWidth(), false, useGpu(deviceId_)); // We use cpu matrix to generate mask so that the mask // will be same for both gpu version and cpu version. // This will help unittest to make sure they have same result. diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index ae7cdb0028120748a3377d8f522c4af03d9cb82d..3d427a1ac6e38f2bcd49195504d1086b83e3cdf3 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -109,7 +108,7 @@ public: virtual void waitInputValue(); /** - * Copy layer's output_ to other device. + * Copy layer's output_ to other device. * If output layer is in other device, called after Layer::forward() function. */ virtual void copyOutputToOtherDevice(); @@ -189,8 +188,11 @@ protected: * Reset to value zero if isValueClean = true, * Reset to grad zero if isGradClean = true. */ - void resetSpecifyOutput(Argument& output, size_t height, size_t width, - bool isValueClean, bool isGradClean); + void resetSpecifyOutput(Argument& output, + size_t height, + size_t width, + bool isValueClean, + bool isGradClean); /** * Add output argument to other devices. @@ -204,48 +206,48 @@ public: /// Register a Layer static ClassRegistrar registrar_; - /** + /** * Get the flag whether layer need to compute gradient. */ bool needGradient() const { return needGradient_; } - /** + /** * Set the flag whether layer need to compute gradient. */ void setNeedGradient(bool need) { needGradient_ = need; } - /** + /** * Set the flag whether layer need to re-compute sequence information, * which includes sequenceStartPositions or subSequenceStartPositions. */ void setNeedSequenceInfo(bool need) { needSequenceInfo_ = need; } - /** + /** * Get layer's name. 
*/ const std::string& getName() const { return config_.name(); } - /** + /** * Get layer's type. */ const std::string& getType() const { return config_.type(); } - /** + /** * Get layer's size. */ size_t getSize() const { return config_.size(); } - /** + /** * Get layer's deviceId. */ int getDeviceId() const { return deviceId_; } - /** + /** * Add the inputLayer. */ void addPrev(LayerPtr l) { inputLayers_.push_back(l); } - /** + /** * Get the inputLayer[i]. */ const LayerPtr& getPrev(size_t i) { return inputLayers_[i]; } @@ -265,7 +267,7 @@ public: */ const MatrixPtr& getOutputGrad() { return output_.grad; } /** - * If layer has multi-output, set output into outputMap_. + * If the layer has multiple outputs, set them into outputMap_. */ void setOutput(const std::string& name, Argument* output) { outputMap_[name] = output; } @@ -351,8 +353,8 @@ public: /** * Initialization for the sub network if there is a sub network. * @param rootNetwork root network - * @param config model config - * @param parameterTypes parameter's type + * @param config model config + * @param parameterTypes parameter's type * @param useGpu whether to use gpu or not */ virtual void initSubNetwork(NeuralNetwork* rootNetwork, @@ -391,7 +393,8 @@ public: /** * Reset the internal state variables. * Allocate them if they have not been allocated. - * This function need to called before Layer::forward() for generating sequence. + * This function needs to be called before Layer::forward() for generating + * sequence. * * This is used for sequence generation. When generating sequence, the * calculation at current timestamp depends on the state from previous @@ -407,7 +410,7 @@ public: virtual void setState(LayerStatePtr state) {} /** - * Get layer state. + * Get layer state. * @return A copy of internal state. */ virtual LayerStatePtr getState() { return nullptr; } diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp index fb54fd26cf36e2b23deba7186c3dcdd0cc445870..2b3a50b2e29cd2291a9fc21980506baa6120563c 100644 --- a/paddle/gserver/layers/LinearChainCRF.cpp +++ b/paddle/gserver/layers/LinearChainCRF.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "LinearChainCRF.h" diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h index c33c83b25987e1b944a84d960cf6539cff1b872f..6368f2b9de2f993c6a113315be8d642784b04726 100644 --- a/paddle/gserver/layers/LinearChainCRF.h +++ b/paddle/gserver/layers/LinearChainCRF.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/math/Matrix.h" @@ -31,7 +30,8 @@ public: * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L} * + \sum_{l=1}^L x_{s_l} * + \sum_{l=2}^L w_{s_{l-1},s_l})\f$ - * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over all possible + * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over + * all possible * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
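The P(s) definition above splits into start, end, emission, and transition terms. As a reference, a sketch of the unnormalized log-score, written independently of LinearChainCRF's packed parameter layout; subtracting log Z (computed by the forward algorithm) would give log P(s):

```cpp
#include <vector>

// Unnormalized log-score of a tag sequence under the comment's model:
//   a[s_1] + b[s_L] + sum_l x[l][s_l] + sum_{l>1} w[s_{l-1}][s_l].
double seqLogScore(const std::vector<int>& s,
                   const std::vector<std::vector<double>>& x,  // L x numTags
                   const std::vector<double>& a,               // start weights
                   const std::vector<double>& b,               // end weights
                   const std::vector<std::vector<double>>& w)  // transitions
{
  double score = a[s.front()] + b[s.back()];
  for (std::size_t l = 0; l < s.size(); ++l) {
    score += x[l][s[l]];                      // emission term
    if (l > 0) score += w[s[l - 1]][s[l]];    // transition term
  }
  return score;
}
```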
*/ LinearChainCRF(int numClasses, real* para, real* grad); diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/gserver/layers/LinearChainCTC.cpp index c0ffadbd91c78f5dcdb9fc2370aa7eb06bfb400e..3368eb4d8a796eef367042f78b8c18d47bc1330e 100644 --- a/paddle/gserver/layers/LinearChainCTC.cpp +++ b/paddle/gserver/layers/LinearChainCTC.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "LinearChainCTC.h" #include @@ -90,7 +89,9 @@ LinearChainCTC::LinearChainCTC(int numClasses, bool normByTimes) Matrix::resizeOrCreate(gradTerms_, 1, numClasses_); } -real LinearChainCTC::forward(real* softmaxSeq, int softmaxSeqLen, int* labelSeq, +real LinearChainCTC::forward(real* softmaxSeq, + int softmaxSeqLen, + int* labelSeq, int labelSeqLen) { isInvalid_ = false; totalTime_ = softmaxSeqLen; @@ -215,7 +216,9 @@ real LinearChainCTC::forward(real* softmaxSeq, int softmaxSeqLen, int* labelSeq, return -logProb_; } -void LinearChainCTC::backward(real* softmaxSeq, real* grad, int* labelSeq, +void LinearChainCTC::backward(real* softmaxSeq, + real* grad, + int* labelSeq, int labelSeqLen) { /* if not meet the conditions of CTC computing, then set the grads to zeros */ if (isInvalid_) { @@ -246,9 +249,9 @@ void LinearChainCTC::backward(real* softmaxSeq, real* grad, int* labelSeq, logMul(logProb_, logActsData[i * numClasses_ + j]))) / totalTime_; } else { - grad[i * numClasses_ + j] += -safeExp(logDiv( - gradTermsData[j], - logMul(logProb_, logActsData[i * numClasses_ + j]))); + grad[i * numClasses_ + j] += -safeExp( + logDiv(gradTermsData[j], + logMul(logProb_, logActsData[i * numClasses_ + j]))); } } } diff --git a/paddle/gserver/layers/LinearChainCTC.h b/paddle/gserver/layers/LinearChainCTC.h index b09218e3e78e16bd13e9dcde8138dd68a579d4ad..0a93d2e9a6d0d697f5f081abe9fad69faac9b04b 100644 --- a/paddle/gserver/layers/LinearChainCTC.h +++ b/paddle/gserver/layers/LinearChainCTC.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -25,11 +24,15 @@ public: LinearChainCTC(int numClasses, bool normByTimes); // Calculate the negative log probability as loss - real forward(real* softmaxSeq, int softmaxSeqLen, int* labelSeq, + real forward(real* softmaxSeq, + int softmaxSeqLen, + int* labelSeq, int labelSeqLen); // calculate the gradient - void backward(real* softmaxSeq, real* softmaxSeqGrad, int* labelSeq, + void backward(real* softmaxSeq, + real* softmaxSeqGrad, + int* labelSeq, int labelSeqLen); protected: diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/gserver/layers/LstmCompute.cpp index ced9636d3528ace044bc925285ac5db88f2ddc4e..38057636edbea5d1d25d20740b16c319a653e42e 100644 --- a/paddle/gserver/layers/LstmCompute.cpp +++ b/paddle/gserver/layers/LstmCompute.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
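LinearChainCTC's forward/backward above run their alpha/beta recursions over the label sequence expanded with blanks, which is why the internal tables are sized around 2 * labelSeqLen + 1 states. A sketch of that standard expansion; the blank id is left as a parameter here, since the exact convention inside the class is not shown in this diff:

```cpp
#include <vector>

// Standard CTC state expansion: for labels (l1, l2, ..., lL) build
// (blank, l1, blank, l2, ..., lL, blank), i.e. 2L + 1 states. The CTC
// alpha/beta recursions are indexed over this expanded sequence.
std::vector<int> expandWithBlanks(const int* labelSeq, int labelSeqLen,
                                  int blankId) {
  std::vector<int> states;
  states.reserve(2 * labelSeqLen + 1);
  states.push_back(blankId);
  for (int i = 0; i < labelSeqLen; ++i) {
    states.push_back(labelSeq[i]);
    states.push_back(blankId);
  }
  return states;
}
```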
*/ - #include "paddle/utils/Util.h" #include "hl_recurrent_apply.cuh" #include "LstmCompute.h" @@ -27,22 +26,31 @@ void LstmCompute::init(LayerConfig &config) { template <> void LstmCompute::forwardOneSequence<0>(hl_lstm_value value, int frameSize) { - hl_cpu_lstm_forward(hppl::forward::lstm(), value, - frameSize, activeNode_, activeGate_, + hl_cpu_lstm_forward(hppl::forward::lstm(), + value, + frameSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::backwardOneSequence<0>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize) { - hl_cpu_lstm_backward(hppl::backward::lstm(), value, grad, - frameSize, activeNode_, activeGate_, +void LstmCompute::backwardOneSequence<0>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize) { + hl_cpu_lstm_backward(hppl::backward::lstm(), + value, + grad, + frameSize, + activeNode_, + activeGate_, activeState_); } template <> -void LstmCompute::forwardBatch<0>(hl_lstm_value value, int frameSize, - int batchSize) { +void LstmCompute::forwardBatch<0>(hl_lstm_value value, + int frameSize, + int batchSize) { for (int b = 0; b < batchSize; b++) { forwardOneSequence<0>(value, frameSize); @@ -57,8 +65,10 @@ void LstmCompute::forwardBatch<0>(hl_lstm_value value, int frameSize, } template <> -void LstmCompute::backwardBatch<0>(hl_lstm_value value, hl_lstm_grad grad, - int frameSize, int batchSize) { +void LstmCompute::backwardBatch<0>(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, + int batchSize) { for (int b = 0; b < batchSize; b++) { backwardOneSequence<0>(value, grad, frameSize); diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h index 638acdb56d75054387f5f368eaf8afc0dbed9107..97be7218f251f21a9a50c7f8ec28e7c487420a2f 100644 --- a/paddle/gserver/layers/LstmCompute.h +++ b/paddle/gserver/layers/LstmCompute.h @@ -35,7 +35,9 @@ public: void forwardBatch(hl_lstm_value value, int frameSize, int batchSize); template - void backwardBatch(hl_lstm_value value, hl_lstm_grad grad, int frameSize, + void backwardBatch(hl_lstm_value value, + hl_lstm_grad grad, + int frameSize, int batchSize); /** @@ -51,7 +53,8 @@ public: template void forwardOneSequence(hl_lstm_value value, int frameSize); template - void backwardOneSequence(hl_lstm_value value, hl_lstm_grad grad, + void backwardOneSequence(hl_lstm_value value, + hl_lstm_grad grad, int frameSize); public: diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp index 61ad47a7fbd02f19a1a8e824b2cba3a3d114b9fc..e70a20e5c0217288b795f647f3918911e3713ceb 100644 --- a/paddle/gserver/layers/LstmLayer.cpp +++ b/paddle/gserver/layers/LstmLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "LstmLayer.h" #include "paddle/math/Matrix.h" #include "paddle/math/BaseMatrix.h" @@ -35,14 +34,26 @@ bool LstmLayer::init(const LayerMap &layerMap, if (biasParameter_.get() != NULL) { bias_.reset(new Weight(1, getSize() * 7, biasParameter_)); if (bias_->getW()) { - localBias_ = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - checkIg_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkFg_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkOg_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); localBias_->setData(bias_->getW()->getData()); checkIg_->setData(bias_->getW()->getData() + getSize() * 4); @@ -51,14 +62,26 @@ bool LstmLayer::init(const LayerMap &layerMap, } if (bias_->getWGrad()) { - localBiasGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - checkIgGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkFgGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - checkOgGrad_ = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); localBiasGrad_->setData(bias_->getWGrad()->getData()); checkIgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 4); checkFgGrad_->setData(bias_->getWGrad()->getData() + getSize() * 5); @@ -84,8 +107,8 @@ bool LstmLayer::init(const LayerMap &layerMap, void LstmLayer::resetState() { CHECK(!reversed_) << "state is not allowed for reversed lstmemory layer"; - Matrix::resizeOrCreate(prevOutput_, 1, getSize(), /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); Matrix::resizeOrCreate(prevState_, 1, getSize(), /* trans= */ false, useGpu_); prevOutput_->resize(0, getSize()); prevState_->resize(0, getSize()); @@ -138,8 +161,10 @@ void LstmLayer::forward(PassType passType) { CHECK_EQ(starts[numSequences], batchSize); Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, getSize() * 4, - /* trans= */ false, useGpu_); + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); if (prevOutput_) { size_t prevNumSeq = useBatch_ ? 
numSequences : 1; if (prevOutput_->getHeight() == 0) { @@ -151,18 +176,29 @@ void LstmLayer::forward(PassType passType) { CHECK_EQ(prevOutput_->getHeight(), prevNumSeq) << "the number of sequences must be the same"; } - Matrix::resizeOrCreate(totalState_, prevState_->getHeight() + batchSize, - getSize(), /*trans*/ false, useGpu_); - state_.value = Matrix::create(nullptr, /* height= */ batchSize, getSize(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(totalState_, + prevState_->getHeight() + batchSize, + getSize(), + /*trans*/ false, + useGpu_); + state_.value = Matrix::create(nullptr, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); state_.value->setData(totalState_->getData() + prevState_->getHeight() * getSize()); } else { - Matrix::resizeOrCreate(state_.value, /* height= */ batchSize, getSize(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(state_.value, + /* height= */ batchSize, + getSize(), + /* trans= */ false, + useGpu_); } Matrix::resizeOrCreate(preOutput_.value, - /* height= */ batchSize, getSize(), /* trans= */ false, + /* height= */ batchSize, + getSize(), + /* trans= */ false, useGpu_); if (!useBatch_) { @@ -171,7 +207,7 @@ void LstmLayer::forward(PassType passType) { if (!useSeqParallel_) { forwardBatch(batchSize, numSequences, starts, input.value); } else { - const int* starts = input.sequenceStartPositions->getData(useGpu_); + const int *starts = input.sequenceStartPositions->getData(useGpu_); forwardSeqParallel(batchSize, numSequences, starts, input.value); } } @@ -188,13 +224,19 @@ void LstmLayer::backward(const UpdateCallback &callback) { size_t numSequences = input.getNumSequences(); Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, getSize() * 4, - /* trans= */ false, useGpu_); + /* height= */ batchSize, + getSize() * 4, + /* trans= */ false, + useGpu_); Matrix::resizeOrCreate(state_.grad, - /* height= */ batchSize, getSize(), /* trans= */ false, + /* height= */ batchSize, + getSize(), + /* trans= */ false, useGpu_); Matrix::resizeOrCreate(preOutput_.grad, - /* height= */ batchSize, getSize(), /* trans= */ false, + /* height= */ batchSize, + getSize(), + /* trans= */ false, useGpu_); state_.grad->zero(); @@ -205,7 +247,7 @@ void LstmLayer::backward(const UpdateCallback &callback) { if (!useSeqParallel_) { backwardBatch(batchSize, numSequences, starts, input.grad); } else { - const int* starts = input.sequenceStartPositions->getData(useGpu_); + const int *starts = input.sequenceStartPositions->getData(useGpu_); backwardSeqParallel(batchSize, numSequences, starts, input.grad); } } @@ -216,8 +258,10 @@ void LstmLayer::backward(const UpdateCallback &callback) { weight_->getParameterPtr()->incUpdate(callback); } -void LstmLayer::forwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue) { +void LstmLayer::forwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { REGISTER_TIMER_INFO("LstmFwSequenceTime", getName().c_str()); gate_.value->assign(*inputValue); if (bias_) { @@ -255,10 +299,16 @@ void LstmLayer::forwardSequence(int batchSize, size_t numSequences, } }; - MatrixPtr frameGate = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = 
Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); if (!reversed_) { if (prevState_) { @@ -316,8 +366,10 @@ void LstmLayer::forwardSequence(int batchSize, size_t numSequences, } } -void LstmLayer::backwardSequence(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad) { +void LstmLayer::backwardSequence(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { REGISTER_TIMER_INFO("LstmBwSequenceTime", getName().c_str()); MatrixPtr weightT = weight_->getW()->getTranspose(); @@ -381,10 +433,16 @@ void LstmLayer::backwardSequence(int batchSize, size_t numSequences, } }; - MatrixPtr frameGate = Matrix::create(nullptr, /* height= */ 1, getSize() * 4, - /* trans= */ false, useGpu_); - MatrixPtr frameOutput = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + MatrixPtr frameGate = Matrix::create(nullptr, + /* height= */ 1, + getSize() * 4, + /* trans= */ false, + useGpu_); + MatrixPtr frameOutput = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); { AsyncGpuBlock asyncGpuBlock; @@ -422,11 +480,15 @@ void LstmLayer::backwardSequence(int batchSize, size_t numSequences, if (!reversed_) { weight_->getWGrad()->mul( output_.value->subMatrix(start, length - 1)->getTranspose(), - gate_.grad->subMatrix(start + 1, length - 1), 1, 1); + gate_.grad->subMatrix(start + 1, length - 1), + 1, + 1); } else { weight_->getWGrad()->mul( output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - gate_.grad->subMatrix(start, length - 1), 1, 1); + gate_.grad->subMatrix(start, length - 1), + 1, + 1); } } } @@ -440,8 +502,10 @@ void LstmLayer::backwardSequence(int batchSize, size_t numSequences, } } -void LstmLayer::forwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue) { +void LstmLayer::forwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { REGISTER_TIMER_INFO("LstmFwBatchTime", getName().c_str()); hl_lstm_value lstmValue; @@ -452,8 +516,8 @@ void LstmLayer::forwardBatch(int batchSize, size_t numSequences, if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); } - batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_, - prevOutput_ ? true : false); + batchValue_->resizeOrCreateBatch( + batchSize, numSequences, starts, reversed_, prevOutput_ ? 
true : false); batchValue_->resizeOrCreate(*output_.value); batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true); @@ -479,8 +543,11 @@ void LstmLayer::forwardBatch(int batchSize, size_t numSequences, MatrixPtr batch1 = batchValue_->getBatchValue(n - 1, batchSize); gateValue->mul(batch1, weight_->getW(), 1, 1); } else if (prevOutput_) { - Matrix::resizeOrCreate(prevBatchOutput2_, gateValue->getHeight(), - getSize(), false, useGpu_); + Matrix::resizeOrCreate(prevBatchOutput2_, + gateValue->getHeight(), + getSize(), + false, + useGpu_); batchValue_->prevOutput2Batch(*prevOutput_, *prevBatchOutput2_); gateValue->mul(prevBatchOutput2_, weight_->getW(), 1, 1); @@ -525,8 +592,10 @@ void LstmLayer::getPrevBatchState(size_t numSequences) { batchValue_->getSeqOutputFromBatch(*prevState_, *state_.value); } -void LstmLayer::backwardBatch(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad) { +void LstmLayer::backwardBatch(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { REGISTER_TIMER_INFO("LstmBwBatchTime", getName().c_str()); hl_lstm_value lstmValue; @@ -593,11 +662,11 @@ void LstmLayer::backwardBatch(int batchSize, size_t numSequences, } } if (useGpu_) { - LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, - getSize(), batchSize); + LstmCompute::backwardBatch<1>( + lstmValue, lstmGrad, getSize(), batchSize); } else { - LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, - getSize(), batchSize); + LstmCompute::backwardBatch<0>( + lstmValue, lstmGrad, getSize(), batchSize); } } @@ -611,8 +680,8 @@ void LstmLayer::backwardBatch(int batchSize, size_t numSequences, MatrixPtr outputValue = batchValue_->getBatchValue(n - 1, batchSize); weight_->getWGrad()->mul(outputValue->getTranspose(), gateGrad, 1, 1); } else if (prevOutput_ && weight_->getWGrad()) { - weight_->getWGrad()->mul(prevBatchOutput2_->getTranspose(), gateGrad, 1, - 1); + weight_->getWGrad()->mul( + prevBatchOutput2_->getTranspose(), gateGrad, 1, 1); } } } @@ -625,8 +694,10 @@ void LstmLayer::backwardBatch(int batchSize, size_t numSequences, } } -void LstmLayer::forwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputValue) { +void LstmLayer::forwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputValue) { REGISTER_TIMER_INFO("LstmFwSeqParallelTime", getName().c_str()); gate_.value->assign(*inputValue); if (bias_) { @@ -641,14 +712,27 @@ void LstmLayer::forwardSeqParallel(int batchSize, size_t numSequences, real *checkFg = checkFg_->getData(); real *checkOg = checkOg_->getData(); real *weight = weight_->getW()->getData(); - hl_lstm_parallel_forward( - gateValue, stateValue, preOutputValue, outputValue, checkIg, checkFg, - checkOg, weight, starts, getSize(), numSequences, reversed_, activeNode_, - activeGate_, activeState_); + hl_lstm_parallel_forward(gateValue, + stateValue, + preOutputValue, + outputValue, + checkIg, + checkFg, + checkOg, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); } -void LstmLayer::backwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad) { +void LstmLayer::backwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad) { REGISTER_TIMER_INFO("LstmBwSeqParallelTime", getName().c_str()); real *gateValue = gate_.value->getData(); real *gateGrad = gate_.grad->getData(); @@ -675,11 +759,27 @@ void LstmLayer::backwardSeqParallel(int batchSize, 
size_t numSequences, checkOgGrad = nullptr; } - hl_lstm_parallel_backward_data( - gateValue, gateGrad, stateValue, stateGrad, preOutputValue, preOutputGrad, - outputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg, - checkOgGrad, weight, starts, getSize(), numSequences, reversed_, - activeNode_, activeGate_, activeState_); + hl_lstm_parallel_backward_data(gateValue, + gateGrad, + stateValue, + stateGrad, + preOutputValue, + preOutputGrad, + outputGrad, + checkIg, + checkIgGrad, + checkFg, + checkFgGrad, + checkOg, + checkOgGrad, + weight, + starts, + getSize(), + numSequences, + reversed_, + activeNode_, + activeGate_, + activeState_); if (inputGrad) { inputGrad->add(*gate_.grad); @@ -691,9 +791,14 @@ void LstmLayer::backwardSeqParallel(int batchSize, size_t numSequences, real *outputValue = output_.value->getData(); if (weight_->getWGrad()) { real *weightGrad = weight_->getWGrad()->getData(); - hl_lstm_parallel_backward_weight(weightGrad, outputValue, gateGrad, - starts, getSize(), batchSize, - numSequences, reversed_); + hl_lstm_parallel_backward_weight(weightGrad, + outputValue, + gateGrad, + starts, + getSize(), + batchSize, + numSequences, + reversed_); } } diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h index e080a401416d55c8684342e00313ae4d5c9cf4e0..5b936ff44ef1bc26850c5051f4d5561529002cd4 100644 --- a/paddle/gserver/layers/LstmLayer.h +++ b/paddle/gserver/layers/LstmLayer.h @@ -97,12 +97,16 @@ protected: * @param starts Each start position of each samples. * @param inputValue The input values. */ - void forwardSequence(int batchSize, size_t numSequences, const int *starts, + void forwardSequence(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputValue); /** * Compute lstm backward one sequence by one sequence. */ - void backwardSequence(int batchSize, size_t numSequences, const int *starts, + void backwardSequence(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputGrad); /** @@ -121,12 +125,16 @@ protected: * } * @endcode */ - void forwardBatch(int batchSize, size_t numSequences, const int *starts, + void forwardBatch(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputValue); /** * Compute lstm backward one batch by one batch. */ - void backwardBatch(int batchSize, size_t numSequences, const int *starts, + void backwardBatch(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputGrad); /** @@ -134,13 +142,17 @@ protected: * batch value. It will launch one kernel to parallelly compute forward * propagation in sequence level. */ - void forwardSeqParallel(int batchSize, size_t numSequences, const int *starts, + void forwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, MatrixPtr inputValue); /** * Backward propagation corresponding to forwardSeqParallel. */ - void backwardSeqParallel(int batchSize, size_t numSequences, - const int *starts, MatrixPtr inputGrad); + void backwardSeqParallel(int batchSize, + size_t numSequences, + const int *starts, + MatrixPtr inputGrad); /** * This function is used for sequence generation and get output after * forwardBatch. diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp index fb0fdbf7e9c9a1a479b47ecf9463b26393642be2..e7a8d519f2dc5eade613f3ad1981434ae8d59b7c 100644 --- a/paddle/gserver/layers/LstmStepLayer.cpp +++ b/paddle/gserver/layers/LstmStepLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "Layer.h" #include "LstmCompute.h" #include "paddle/utils/Stat.h" @@ -49,24 +48,36 @@ bool LstmStepLayer::init(const LayerMap& layerMap, if (!Layer::init(layerMap, parameterMap)) return false; CHECK_EQ(2U, inputLayers_.size()); - checkIg_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkFg_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkOg_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkIgGrad_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkFgGrad_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); - checkOgGrad_ = - Matrix::create(nullptr, - /* height= */ 1, getSize(), /* trans= */ false, useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); if (biasParameter_.get() != NULL) { CHECK_EQ(getSize() * 3, biasParameter_->getSize()); @@ -101,12 +112,21 @@ void LstmStepLayer::forward(PassType passType) { CHECK_EQ(getSize(), prevState.value->getWidth()); int batchSize = input.getBatchSize(); reserveOutput(batchSize, getSize()); - resetSpecifyOutput(state_, batchSize, getSize(), /* isValueClean */ false, + resetSpecifyOutput(state_, + batchSize, + getSize(), + /* isValueClean */ false, /* isGradClean */ true); - resetSpecifyOutput(gate_, batchSize, getSize() * 4, - /* isValueClean */ false, /* isGradClean */ false); - resetSpecifyOutput(stateActive_, batchSize, getSize(), - /* isValueClean */ false, /* isGradClean */ false); + resetSpecifyOutput(gate_, + batchSize, + getSize() * 4, + /* isValueClean */ false, + /* isGradClean */ false); + resetSpecifyOutput(stateActive_, + batchSize, + getSize(), + /* isValueClean */ false, + /* isGradClean */ false); gate_.value->assign(*input.value); hl_lstm_value lstmValue; @@ -156,11 +176,9 @@ void LstmStepLayer::backward(const UpdateCallback& callback) { lstmGrad.checkOgGrad = checkOgGrad_->getData(); if (useGpu_) { - LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), - batchSize); + LstmCompute::backwardBatch<1>(lstmValue, lstmGrad, getSize(), batchSize); } else { - LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), - batchSize); + LstmCompute::backwardBatch<0>(lstmValue, lstmGrad, getSize(), batchSize); } if (input.grad) { diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp index 8ca92dee6d0720ad385ba85da3db2ba36372c43d..93f52c1c314105f9d0b2530218d43045224df948 100644 --- a/paddle/gserver/layers/MDLstmLayer.cpp +++ b/paddle/gserver/layers/MDLstmLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
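LstmLayer::forwardBatch above relies on SequenceToBatch to regroup variable-length sequences so that the n-th frame of every still-active sequence forms one dense batch per step. A sketch of just the scheduling arithmetic from CPU sequenceStartPositions; the real class additionally reorders sequences and builds index maps for the seq2batch copies:

```cpp
#include <algorithm>
#include <vector>

// Given CPU sequenceStartPositions (numSequences + 1 offsets), compute how
// many rows each time-step batch holds: batch n contains the n-th frame of
// every sequence whose length exceeds n, which is the unit of work each
// iteration of LstmLayer::forwardBatch processes.
std::vector<int> batchSizesPerStep(const std::vector<int>& starts) {
  int maxLen = 0;
  for (std::size_t i = 0; i + 1 < starts.size(); ++i)
    maxLen = std::max(maxLen, starts[i + 1] - starts[i]);
  std::vector<int> sizes(maxLen, 0);
  for (std::size_t i = 0; i + 1 < starts.size(); ++i) {
    int len = starts[i + 1] - starts[i];
    for (int n = 0; n < len; ++n) ++sizes[n];
  }
  return sizes;  // e.g. starts {0,3,5,9} -> lengths {3,2,4} -> {3,3,2,1}
}
```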
*/ - #include "LstmLayer.h" #include "paddle/math/Matrix.h" #include "paddle/math/BaseMatrix.h" @@ -106,7 +105,8 @@ public: bool end() { return end_; } - bool getPrePos(const std::vector& delays, int idx, + bool getPrePos(const std::vector& delays, + int idx, std::vector& prePos) { bool isAvial = true; prePos.clear(); @@ -129,7 +129,8 @@ public: return isAvial; } - bool getNextPos(const std::vector& delays, int idx, + bool getNextPos(const std::vector& delays, + int idx, std::vector& nextPos) { bool isAvial = true; nextPos.clear(); @@ -232,24 +233,46 @@ bool MDLstmLayer::init(const LayerMap& layerMap, new Weight(numBlocks_, numBlocks_ * (3 + numDims_), parameters_[0])); if (biasParameter_.get() != NULL) { bias_.reset(new Weight(1, numBlocks_ * (5 + 2 * numDims_), biasParameter_)); - localBias_ = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); - checkIg_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - checkFg_ = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); - checkOg_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - localBiasGrad_ = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); - checkIgGrad_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - checkFgGrad_ = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); - checkOgGrad_ = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + localBias_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + checkIg_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + checkFg_ = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + checkOg_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + localBiasGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + checkIgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + checkFgGrad_ = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + checkOgGrad_ = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); localBias_->setData(bias_->getW()->getData()); checkIg_->setData(bias_->getW()->getData() + numBlocks_ * (3 + numDims_)); @@ -315,49 +338,79 @@ void MDLstmLayer::forward(PassType passType) { frameOutput_.reserve(batchSize); Matrix::resizeOrCreate(gate_.value, - /* height= */ batchSize, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); + /* height= */ batchSize, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); for (int i = frameGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); - arg.grad = - Matrix::create(nullptr, /* height= */ 1, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); 
frameGate_.push_back(arg); } for (int i = frameInputGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameInputGate_.push_back(arg); } for (int i = frameForgetGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ numDims_, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ numDims_, + numBlocks_, + /* trans= */ false, + useGpu_); frameForgetGate_.push_back(arg); } for (int i = frameOutputGate_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameOutputGate_.push_back(arg); } for (int i = frameInputNode_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameInputNode_.push_back(arg); } for (int i = frameState_.size(); i < batchSize; i++) { @@ -374,10 +427,16 @@ void MDLstmLayer::forward(PassType passType) { } for (int i = frameOutput_.size(); i < batchSize; i++) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, numBlocks_, - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + numBlocks_, + /* trans= */ false, + useGpu_); frameOutput_.push_back(arg); } @@ -432,13 +491,19 @@ void MDLstmLayer::forwardGate2OutputSequence(int start, *frameState_[start + preOffsetV[i]].value, *checkIg_, 1.0, 1.0); MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); MatrixPtr checkFgOneDim = - Matrix::create(checkFg_->getData() + i * numBlocks_, 1.0, numBlocks_, - false, useGpu_); - fgGateOneDim->addDotMul(*frameState_[start + preOffsetV[i]].value, - *checkFgOneDim, 1.0, 1.0); + Matrix::create(checkFg_->getData() + i * numBlocks_, + 1.0, + numBlocks_, + false, + useGpu_); + fgGateOneDim->addDotMul( + *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0); } } 
activationGate_->forward(frameInputGate_[idxCurr]); @@ -449,18 +514,22 @@ void MDLstmLayer::forwardGate2OutputSequence(int start, for (int i = 0; i < numDims_; i++) { if (preOffsetV[i] >= 0) { MatrixPtr fgGateOneDim = Matrix::create( - frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].value->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); frameState_[idxCurr].value->addDotMul( *frameState_[start + preOffsetV[i]].value, *fgGateOneDim, 1.0, 1.0); } } frameState_[idxCurr].value->addDotMul(*frameInputNode_[idxCurr].value, - *frameInputGate_[idxCurr].value, 1.0, + *frameInputGate_[idxCurr].value, + 1.0, 1.0); - frameOutputGate_[idxCurr].value->addDotMul(*frameState_[idxCurr].value, - *checkOg_, 1.0, 1.0); + frameOutputGate_[idxCurr].value->addDotMul( + *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0); activationGate_->forward(frameOutputGate_[idxCurr]); framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value)); @@ -493,8 +562,10 @@ void MDLstmLayer::backward(const UpdateCallback& callback) { size_t numSequences = input.getNumSequences(); Matrix::resizeOrCreate(gate_.grad, - /* height= */ batchSize, numBlocks_ * (3 + numDims_), - /* trans= */ false, useGpu_); + /* height= */ batchSize, + numBlocks_ * (3 + numDims_), + /* trans= */ false, + useGpu_); for (int i = 0; i < batchSize; i++) { if (frameState_[i].grad == NULL) @@ -576,8 +647,8 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, *framePreOutput_[idxCurr].value); activationGate_->backward(frameOutputGate_[idxCurr]); - frameState_[idxCurr].grad->addDotMul(*frameOutputGate_[idxCurr].grad, - *checkOg_, 1.0, 1.0); + frameState_[idxCurr].grad->addDotMul( + *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0); for (int i = 0; i < numDims_; i++) { if (nextOffsetV[i] >= 0) { frameState_[idxCurr].grad->addDotMul( @@ -586,18 +657,26 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, MatrixPtr fgGateOneDimGrad = Matrix::create( frameForgetGate_[start + nextOffsetV[i]].grad->getData() + i * numBlocks_, - 1, numBlocks_, false, useGpu_); + 1, + numBlocks_, + false, + useGpu_); MatrixPtr fgGateOneDimVal = Matrix::create( frameForgetGate_[start + nextOffsetV[i]].value->getData() + i * numBlocks_, - 1, numBlocks_, false, useGpu_); + 1, + numBlocks_, + false, + useGpu_); MatrixPtr checkFgOneDim = Matrix::create( checkFg_->getData() + i * numBlocks_, 1, numBlocks_, false, useGpu_); - frameState_[idxCurr].grad->addDotMul(*fgGateOneDimGrad, *checkFgOneDim, - 1.0, 1.0); frameState_[idxCurr].grad->addDotMul( - *frameState_[start + nextOffsetV[i]].grad, *fgGateOneDimVal, 1.0, + *fgGateOneDimGrad, *checkFgOneDim, 1.0, 1.0); + frameState_[idxCurr].grad->addDotMul( + *frameState_[start + nextOffsetV[i]].grad, + *fgGateOneDimVal, + 1.0, 1.0); } } @@ -611,11 +690,15 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, for (int i = 0; i < numDims_; i++) { if (preOffsetV[i] >= 0) { MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); fgGateOneDimGrad->addDotMul(*frameState_[idxCurr].grad, *frameState_[start + preOffsetV[i]].value, - 1.0, 1.0); + 1.0, + 1.0); } } @@ -627,22 +710,30 @@ void MDLstmLayer::backwardGate2OutputSequence(int start, for (int i = 0; i < numDims_; i++) { if (preOffsetV[i] >= 0) { 
checkIgGrad_->addDotMul(*frameInputGate_[idxCurr].grad, - *frameState_[start + preOffsetV[i]].value, 1.0, + *frameState_[start + preOffsetV[i]].value, + 1.0, 1.0); MatrixPtr fgGateOneDimGrad = Matrix::create( - frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + frameForgetGate_[idxCurr].grad->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); MatrixPtr checkFgOneDimGrad = - Matrix::create(checkFgGrad_->getData() + i * numBlocks_, 1, - numBlocks_, false, useGpu_); + Matrix::create(checkFgGrad_->getData() + i * numBlocks_, + 1, + numBlocks_, + false, + useGpu_); checkFgOneDimGrad->addDotMul(*fgGateOneDimGrad, *frameState_[start + preOffsetV[i]].value, - 1.0, 1.0); + 1.0, + 1.0); } } - checkOgGrad_->addDotMul(*frameOutputGate_[idxCurr].grad, - *frameState_[idxCurr].value, 1.0, 1.0); + checkOgGrad_->addDotMul( + *frameOutputGate_[idxCurr].grad, *frameState_[idxCurr].value, 1.0, 1.0); } } @@ -660,7 +751,9 @@ void MDLstmLayer::backwardOneSequence(int start, CoordIterator& coordIter) { if (weight_->getWGrad()) { weight_->getWGrad()->mul( frameOutput_[start + preOffset].value->getTranspose(), - frameGate_[start + offset].grad, 1.0, 1.0); + frameGate_[start + offset].grad, + 1.0, + 1.0); } } } diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp index b80de87b4e9cc56b272f172c304027026039be06..22670fa1210e1199266cb16a1f08826c3010a84e 100644 --- a/paddle/gserver/layers/MaxIdLayer.cpp +++ b/paddle/gserver/layers/MaxIdLayer.cpp @@ -45,7 +45,10 @@ public: const Argument& input = getInput(0); size_t batchSize = input.getBatchSize(); IVector::resizeOrCreate(output_.ids, batchSize * beamSize_, useGpu_); - Matrix::resizeOrCreate(output_.in, batchSize, beamSize_, false, + Matrix::resizeOrCreate(output_.in, + batchSize, + beamSize_, + false, /* useGpu */ useGpu_); output_.value = nullptr; input.value->rowMax(*output_.ids, *output_.in); diff --git a/paddle/gserver/layers/MaxLayer.cpp b/paddle/gserver/layers/MaxLayer.cpp index c4ffe894eccd61b6fe6baf9cafa95faf543d8c98..42bc6bb815232ff8dfa6b49ebf47b10c252e28c5 100644 --- a/paddle/gserver/layers/MaxLayer.cpp +++ b/paddle/gserver/layers/MaxLayer.cpp @@ -23,8 +23,8 @@ REGISTER_LAYER(max, MaxLayer); void MaxLayer::forward(PassType passType) { SequencePoolLayer::forward(passType); - IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(), - useGpu(deviceId_)); + IVector::resizeOrCreate( + maxIndex_, newBatchSize_ * getSize(), useGpu(deviceId_)); maxIndex_->zeroMem(); MatrixPtr inputValue = getInputValue(0); diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h index e6dcfe9c6759d137d90d8d6f382b91a9fb551323..74df0b8b576c8ea1eef56d465e8c4ceee5019fdb 100644 --- a/paddle/gserver/layers/MaxLayer.h +++ b/paddle/gserver/layers/MaxLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "SequencePoolLayer.h" diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp index 26b1360290ffba316816db898855d8c0b9bdaaa7..1392188fcae715734d96b1402924515fa3618965 100644 --- a/paddle/gserver/layers/MixedLayer.cpp +++ b/paddle/gserver/layers/MixedLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
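MDLstmLayer above walks an N-dimensional grid with CoordIterator; getPrePos shifts one coordinate back by its delay and reports whether the neighbour is still on the grid. A condensed sketch of that rule, simplified to forward scanning only (the real iterator also handles per-dimension reversal):

```cpp
#include <vector>

// Predecessor of a grid position along dimension idx, in the spirit of
// CoordIterator::getPrePos: step back by delays[idx] and report whether
// the neighbour stays inside [0, dims[idx]). An out-of-range neighbour
// means there is no previous state to read along that dimension.
bool getPrePosSimple(const std::vector<int>& pos,
                     const std::vector<int>& delays, int idx,
                     const std::vector<int>& dims, std::vector<int>& prePos) {
  prePos = pos;
  prePos[idx] -= delays[idx];
  return prePos[idx] >= 0 && prePos[idx] < dims[idx];
}
```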
*/ - #include "paddle/utils/Stat.h" #include "MixedLayer.h" @@ -29,8 +28,8 @@ bool MixedLayer::init(const LayerMap& layerMap, projections_.resize(inputLayers_.size()); for (size_t i = 0; i < inputLayers_.size(); i++) { if (config_.inputs(i).has_proj_conf()) { - projections_[i].reset(Projection::create(config_.inputs(i).proj_conf(), - parameters_[i], useGpu_)); + projections_[i].reset(Projection::create( + config_.inputs(i).proj_conf(), parameters_[i], useGpu_)); } else { CHECK(!parameters_[i]) << "should no parameters for operators"; } @@ -46,8 +45,7 @@ bool MixedLayer::init(const LayerMap& layerMap, if (biasParameter_.get() != NULL) { sharedBias_ = config_.shared_biases(); size_t psize = config_.bias_size(); - biases_ = std::unique_ptr( - new Weight(1, psize, biasParameter_)); + biases_ = std::unique_ptr(new Weight(1, psize, biasParameter_)); } return true; diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h index 5842e51e1d79d959d580e9cb92bead2d1961c9e6..271e0c2538d3b7239a5d54ec43180dddff569b76 100644 --- a/paddle/gserver/layers/MixedLayer.h +++ b/paddle/gserver/layers/MixedLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -22,8 +21,8 @@ limitations under the License. */ namespace paddle { /** - * A mixed layer has multiple input layers. - * Each input layer was processed by a Projection or Operator. + * A mixed layer has multiple input layers. + * Each input layer was processed by a Projection or Operator. * The results of all projections or Operators are summed together with bias * (if configured), and then go through an activation function and dropout * (if configured). @@ -43,7 +42,7 @@ public: virtual void backward(const UpdateCallback& callback = nullptr); virtual void resetState(); /** - * setState() should be called after getState(). + * setState() should be called after getState(). * Argument state consists of all projections states. */ virtual void setState(LayerStatePtr state); diff --git a/paddle/gserver/layers/MultinomialSampler.cpp b/paddle/gserver/layers/MultinomialSampler.cpp index 518dc0c60cbdc2a95b7eb9c8ff33dd6a9fb87c98..e85dca72d3162d857e768221e970fe8e3951ae9c 100644 --- a/paddle/gserver/layers/MultinomialSampler.cpp +++ b/paddle/gserver/layers/MultinomialSampler.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MultinomialSampler.h" namespace paddle { diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h index 442124704ac0a9bdfba7ce67da279e2bc8e03394..59683d2ee29924e76ca11eb43fbd8cd175c3c357 100644 --- a/paddle/gserver/layers/MultinomialSampler.h +++ b/paddle/gserver/layers/MultinomialSampler.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp index a70172d9a6344b704cd775ce872186273d2aa4b9..c681eb0623ab7b8426fe34ce6817a3f5f4ad8246 100644 --- a/paddle/gserver/layers/MultiplexLayer.cpp +++ b/paddle/gserver/layers/MultiplexLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp index 4faebe5d2ad6f94c36de52b36b1a0828e7710005..50b29cdea5a352093c0508995da4cf3e2afcc995 100644 --- a/paddle/gserver/layers/NCELayer.cpp +++ b/paddle/gserver/layers/NCELayer.cpp @@ -23,7 +23,8 @@ namespace paddle { /** * Noise-contrastive estimation. * Implements the method in the following paper: - * A fast and simple algorithm for training neural probabilistic language models. + * A fast and simple algorithm for training neural probabilistic language + * models. * * The config file api is nce_layer. */ @@ -180,8 +181,11 @@ public: int size = getSize(); resetOutput(batchSize, size); - Matrix::resizeOrCreate(sampleOut_.value, 1, samples_.size(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(sampleOut_.value, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); forwardBias(); @@ -195,8 +199,11 @@ public: } void backward(const UpdateCallback& callback) { - Matrix::resizeOrCreate(sampleOut_.grad, 1, samples_.size(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(sampleOut_.grad, + 1, + samples_.size(), + /* trans= */ false, + useGpu_); backwardCost(); @@ -241,7 +248,8 @@ public: real* sampleOut = sampleOut_.value->getData(); for (size_t i = 0; i < samples_.size(); ++i) { - sampleOut[i] += dotProduct(dim, inputMat->getRowBuf(samples_[i].sampleId), + sampleOut[i] += dotProduct(dim, + inputMat->getRowBuf(samples_[i].sampleId), weightMat->getRowBuf(samples_[i].labelId)); } } @@ -257,7 +265,9 @@ public: if (weightGradMat) { for (size_t i = 0; i < samples_.size(); ++i) { - axpy(dim, sampleGrad[i], inputMat->getRowBuf(samples_[i].sampleId), + axpy(dim, + sampleGrad[i], + inputMat->getRowBuf(samples_[i].sampleId), weightGradMat->getRowBuf(samples_[i].labelId)); } weights_[layerId]->incUpdate(callback); @@ -265,7 +275,9 @@ public: if (inputGradMat) { for (size_t i = 0; i < samples_.size(); ++i) { - axpy(dim, sampleGrad[i], weightMat->getRowBuf(samples_[i].labelId), + axpy(dim, + sampleGrad[i], + weightMat->getRowBuf(samples_[i].labelId), inputGradMat->getRowBuf(samples_[i].sampleId)); } } diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp index ad8b92d2ff72426d30f2488af7d168ffd8e5b65d..7f6ffe229842113869b4f2d61d59cdc0f4e1ddf8 100644 --- a/paddle/gserver/layers/NormLayer.cpp +++ b/paddle/gserver/layers/NormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
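The NCELayer hunks above are the sampled core of noise-contrastive estimation: forward accumulates one dot product per (sampleId, labelId) pair, and backward applies the matching rank-1 axpy updates to the weight and input gradients. A plain-array restatement of that pattern (Sample, nceForward, nceBackward, and the row-major layout are assumptions of this sketch):

#include <cstddef>
#include <vector>

struct Sample { int sampleId; int labelId; };  // mirrors the samples_ entries

// sampleOut[i] += <input row sampleId, weight row labelId>   (forward dotProduct)
void nceForward(const std::vector<Sample>& samples, std::size_t dim,
                const float* input, const float* weight, float* sampleOut) {
  for (std::size_t i = 0; i < samples.size(); ++i) {
    const float* x = input + samples[i].sampleId * dim;
    const float* w = weight + samples[i].labelId * dim;
    float dot = 0.0f;
    for (std::size_t d = 0; d < dim; ++d) dot += x[d] * w[d];
    sampleOut[i] += dot;
  }
}

// weightGrad[labelId] += g * input[sampleId] and
// inputGrad[sampleId] += g * weight[labelId]                 (backward axpy)
void nceBackward(const std::vector<Sample>& samples, std::size_t dim,
                 const float* sampleGrad, const float* input,
                 const float* weight, float* inputGrad, float* weightGrad) {
  for (std::size_t i = 0; i < samples.size(); ++i) {
    float g = sampleGrad[i];
    for (std::size_t d = 0; d < dim; ++d) {
      weightGrad[samples[i].labelId * dim + d] +=
          g * input[samples[i].sampleId * dim + d];
      inputGrad[samples[i].sampleId * dim + d] +=
          g * weight[samples[i].labelId * dim + d];
    }
  }
}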
*/ - #include "paddle/utils/Logging.h" #include "NormLayer.h" #include "NormProjectionLayer.h" diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h index 2b05be6fcb44fc3f61f9be4e464b2100284bf5c6..9e848e5268d6b4b69f24802b66c5fed7cc1bf9e4 100644 --- a/paddle/gserver/layers/NormLayer.h +++ b/paddle/gserver/layers/NormLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -44,8 +43,8 @@ public: /** * @brief response normalization within feature maps - * namely normalize in independent channel - * When code refactoring, we delete the original implementation. + * namely normalize in independent channel + * When code refactoring, we delete the original implementation. * Need to implement in the futrue. */ class ResponseNormLayer : public NormLayer { diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index eab6e904ee998b876a4dd7c503eec3a9a84f7412..6ac468e6fc7c2962beaf8c28192890634340b296 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" #include "NormProjectionLayer.h" @@ -65,8 +64,8 @@ void CMRProjectionNormLayer::forward(PassType passType) { denoms_->zeroMem(); - outV->crossMapNormalFwd(*input, imgSizeH_, imgSizeW_, *denoms_, channels_, - size_, scale_, pow_); + outV->crossMapNormalFwd( + *input, imgSizeH_, imgSizeW_, *denoms_, channels_, size_, scale_, pow_); } void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { @@ -81,8 +80,15 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { MatrixPtr localOutV = getOutputValue(); MatrixPtr preOutV = inputLayers_[0]->getOutputValue(); - preOutGrad->crossMapNormalBwd(*localGrad, *denoms_, *preOutV, *localOutV, - channels_, imgSizeH_, imgSizeW_, size_, scale_, + preOutGrad->crossMapNormalBwd(*localGrad, + *denoms_, + *preOutV, + *localOutV, + channels_, + imgSizeH_, + imgSizeW_, + size_, + scale_, pow_); } } // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h index 728806ea76958382a3ad06804f773c959598d043..b42e98ab0941e59a38bb1cfa73f49682dbef942c 100644 --- a/paddle/gserver/layers/NormProjectionLayer.h +++ b/paddle/gserver/layers/NormProjectionLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "NormLayer.h" diff --git a/paddle/gserver/layers/Operator.cpp b/paddle/gserver/layers/Operator.cpp index 5fa8239ac5d6f11da0558c8c9eddf8af378f0df3..b89c4740142e377f0cbbe755377f37baac270552 100644 --- a/paddle/gserver/layers/Operator.cpp +++ b/paddle/gserver/layers/Operator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Operator.h" namespace paddle { diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h index 9ee16f70ee3a3cae3b7e764c674bbef348a300fc..ff6558dc73b8d60f3b4a3d87c9d28c650c8f2987 100644 --- a/paddle/gserver/layers/Operator.h +++ b/paddle/gserver/layers/Operator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/parameter/Parameter.h" @@ -48,12 +47,14 @@ public: static ClassRegistrar registrar_; /** - * Forward propagation. If backward() will be called, in and out must be kept valid until then. + * Forward propagation. If backward() will be called, in and out must be kept + * valid until then. * @param ins inputs of operator * @param out output of operator * @param passType PASS_TRAIN of PASS_TEST */ - void forward(std::vector ins, Argument* out, + void forward(std::vector ins, + Argument* out, PassType passType) { ins_ = ins; out_ = out; diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp index 708c901ba9e9d2a5421fc64789f4ac174b365dc1..9b24a4f440c9e1fc3b4e73a7234c791fff045ea9 100644 --- a/paddle/gserver/layers/OuterProdLayer.cpp +++ b/paddle/gserver/layers/OuterProdLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -58,12 +57,15 @@ bool OuterProdLayer::init(const LayerMap& layerMap, CHECK_EQ(dim0 * dim1, getSize()) << "Dimension mismatch"; - tmpRow0 = Matrix::create(nullptr, /* height= */ 1, dim0, /* trans= */ false, - useGpu_); - tmpRow1 = Matrix::create(nullptr, /* height= */ 1, dim1, /* trans= */ false, + tmpRow0 = Matrix::create( + nullptr, /* height= */ 1, dim0, /* trans= */ false, useGpu_); + tmpRow1 = Matrix::create( + nullptr, /* height= */ 1, dim1, /* trans= */ false, useGpu_); + tmpMtx0 = Matrix::create(nullptr, + /* height= */ dim0, + dim1, + /* trans= */ false, useGpu_); - tmpMtx0 = Matrix::create(nullptr, /* height= */ dim0, dim1, - /* trans= */ false, useGpu_); return true; } diff --git a/paddle/gserver/layers/ParameterReluLayer.cpp b/paddle/gserver/layers/ParameterReluLayer.cpp index 98d108db5f05252aefa76fcad3d3eb429d59e82a..cd3bffa2e1d01ef8367c39c20c8e6f366c583b68 100644 --- a/paddle/gserver/layers/ParameterReluLayer.cpp +++ b/paddle/gserver/layers/ParameterReluLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "ParameterReluLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -59,8 +58,8 @@ void ParameterReluLayer::backward(const UpdateCallback& callback) { } MatrixPtr preGrad = getInputGrad(0); - preGrad->paramReluBackwardDiff(*getOutputGrad(), *(getInputValue(0)), - *(weight_->getW())); + preGrad->paramReluBackwardDiff( + *getOutputGrad(), *(getInputValue(0)), *(weight_->getW())); { REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); weight_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h index 367e4e787c5ef24a934974af54c7b2bb8cd6de5f..029c09381f0e13de111ef30c4574d2255abfd018 100644 --- a/paddle/gserver/layers/ParameterReluLayer.h +++ b/paddle/gserver/layers/ParameterReluLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp index 2fbc9001f11613cd987e3815f6f31caa8f9979cf..511dfd87c12551c91e8864364dbf1a1085a989b6 100644 --- a/paddle/gserver/layers/PoolLayer.cpp +++ b/paddle/gserver/layers/PoolLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "PoolLayer.h" #include "PoolProjectionLayer.h" diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h index e87ad08251dd40214f61857251a03e56867a675e..59be295a538b007993e77f85f079f78a8b881eca 100644 --- a/paddle/gserver/layers/PoolLayer.h +++ b/paddle/gserver/layers/PoolLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp index 9be5aba3d57d23e462c9ea3608491606f988c35f..1b227c8084991e4bbf1e380881a6018fe01e9180 100644 --- a/paddle/gserver/layers/PoolProjection.cpp +++ b/paddle/gserver/layers/PoolProjection.cpp @@ -19,7 +19,8 @@ namespace paddle { REGISTER_PROJECTION_CREATE_FUNC(pool, &PoolProjection::create); PoolProjection::PoolProjection(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) + ParameterPtr parameter, + bool useGpu) : Projection(config, parameter, useGpu) { const PoolConfig& conf = config_.pool_conf(); poolType_ = conf.pool_type(); @@ -47,9 +48,15 @@ size_t PoolProjection::getSize() { if (imgSize_ == 0) { imgSize_ = conf.img_size(); } - outputY_ = outputSize(imgSizeY_, sizeY_, confPaddingY_, strideY_, + outputY_ = outputSize(imgSizeY_, + sizeY_, + confPaddingY_, + strideY_, /* caffeMode */ false); - outputX_ = outputSize(imgSize_, sizeX_, confPadding_, stride_, + outputX_ = outputSize(imgSize_, + sizeX_, + confPadding_, + stride_, /* caffeMode */ false); const_cast(out_)->setFrameHeight(outputY_); @@ -59,7 +66,8 @@ size_t PoolProjection::getSize() { } PoolProjection* PoolProjection::create(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) { + ParameterPtr parameter, + bool useGpu) { const std::string& pool = config.pool_conf().pool_type(); if (pool == "max-projection") { return new MaxPoolProjection(config, parameter, useGpu); @@ -76,8 +84,17 @@ void MaxPoolProjection::forward() { CHECK_EQ(width, out_->value->getWidth()); MatrixPtr inputV = in_->value; MatrixPtr outV = out_->value; - outV->maxPoolForward(*inputV, imgSizeY_, imgSize_, channels_, sizeX_, sizeY_, - strideY_, stride_, outputY_, outputX_, confPaddingY_, + outV->maxPoolForward(*inputV, + imgSizeY_, + imgSize_, + channels_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + confPaddingY_, confPadding_); } @@ -91,9 +108,21 @@ void MaxPoolProjection::backward(const UpdateCallback& callback) { if (NULL == inputGrad) { return; } - inputGrad->maxPoolBackward(*inputV, imgSizeY_, imgSize_, *outGrad, *outV, - sizeX_, sizeY_, strideY_, stride_, outputY_, - outputX_, 1, 1, confPaddingY_, confPadding_); + inputGrad->maxPoolBackward(*inputV, + imgSizeY_, + imgSize_, + *outGrad, + *outV, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + 1, + 1, + confPaddingY_, + confPadding_); } void AvgPoolProjection::forward() { @@ -101,8 +130,17 @@ void AvgPoolProjection::forward() { CHECK_EQ(width, out_->value->getWidth()); MatrixPtr inputV = in_->value; MatrixPtr outV = out_->value; - outV->avgPoolForward(*inputV, imgSizeY_, imgSize_, channels_, sizeX_, sizeY_, - strideY_, stride_, outputY_, outputX_, confPaddingY_, + outV->avgPoolForward(*inputV, + imgSizeY_, + imgSize_, + channels_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + confPaddingY_, confPadding_); } @@ -116,8 +154,18 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) { return; } - inputGrad->avgPoolBackward(*outputGrad, imgSizeY_, imgSize_, sizeX_, sizeY_, - strideY_, stride_, outputY_, outputX_, 1, 1, - confPaddingY_, confPadding_); + inputGrad->avgPoolBackward(*outputGrad, + imgSizeY_, + imgSize_, + sizeX_, + sizeY_, + strideY_, + stride_, + outputY_, + outputX_, + 1, + 1, + confPaddingY_, + confPadding_); } } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h index 
a11e25b729cb7afabdb3547326f269e54ddf42da..9c3191bd80061c13b645c2a107eaa723e2495032 100644 --- a/paddle/gserver/layers/PoolProjection.h +++ b/paddle/gserver/layers/PoolProjection.h @@ -30,11 +30,13 @@ protected: std::string poolType_; public: - PoolProjection(const ProjectionConfig& config, ParameterPtr parameter, + PoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu); static PoolProjection* create(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu); + ParameterPtr parameter, + bool useGpu); const std::string& getPoolType() const { return poolType_; } @@ -43,7 +45,8 @@ public: class MaxPoolProjection : public PoolProjection { public: - MaxPoolProjection(const ProjectionConfig& config, ParameterPtr parameter, + MaxPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) : PoolProjection(config, parameter, useGpu) {} @@ -53,7 +56,8 @@ public: class AvgPoolProjection : public PoolProjection { public: - AvgPoolProjection(const ProjectionConfig& config, ParameterPtr parameter, + AvgPoolProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) : PoolProjection(config, parameter, useGpu) {} diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp index cabb346d6c99178f7c8ce049d495785c0a488173..aabc60af197af30a367c0f933276116ba316bd34 100644 --- a/paddle/gserver/layers/PoolProjectionLayer.cpp +++ b/paddle/gserver/layers/PoolProjectionLayer.cpp @@ -18,7 +18,6 @@ limitations under the License. */ namespace paddle { - size_t PoolProjectionLayer::getSize() { CHECK_EQ(inputLayers_.size(), 1UL); size_t layerSize = 0; @@ -31,9 +30,15 @@ size_t PoolProjectionLayer::getSize() { imgSizeH_ = imgSize_; } - outputH_ = outputSize(imgSizeH_, sizeY_, confPaddingY_, strideY_, + outputH_ = outputSize(imgSizeH_, + sizeY_, + confPaddingY_, + strideY_, /* caffeMode */ false); - outputW_ = outputSize(imgSizeW_, sizeX_, confPadding_, stride_, + outputW_ = outputSize(imgSizeW_, + sizeX_, + confPadding_, + stride_, /* caffeMode */ false); layerSize = outputH_ * outputW_ * channels_; diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp index 44c5e6063b1aed93b3fbb175821f911ca26fac1a..0b9672f220919c6ee1a792fc2d68e8ae540ea09a 100644 --- a/paddle/gserver/layers/PowerLayer.cpp +++ b/paddle/gserver/layers/PowerLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -26,7 +25,7 @@ namespace paddle { * \f[ * y = x^w * \f] - * where \f$x\f$ is a input vector, \f$w\f$ is scalar weight, + * where \f$x\f$ is an input vector, \f$w\f$ is a scalar weight, * and output \f$y\f$ is a vector. * * The config file api is power_layer.
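PoolProjection::getSize and PoolProjectionLayer::getSize above both call outputSize(...) with /* caffeMode */ false. The conventional arithmetic behind such a helper, sketched for reference (this follows the usual floor-vs-ceil convention; Paddle's own helper may treat edge cases differently):

// Pooled extent along one axis. caffeMode=false rounds up, so a partially
// covered window at the border still yields an output; caffeMode=true
// rounds down.
int outputSize(int imgSize, int kernel, int padding, int stride,
               bool caffeMode) {
  int span = imgSize - kernel + 2 * padding;
  return (caffeMode ? span / stride : (span + stride - 1) / stride) + 1;
}
// e.g. imgSize=6, kernel=3, padding=0, stride=2:
//   caffeMode=true -> 2, caffeMode=false -> 3.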
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index 68fee69f44d0c2c144f6dde6fd8ff36bd96094f6..95be7b34cb106665d2465630233fca6b34d71e79 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -18,8 +18,7 @@ namespace paddle { class PrintLayer : public Layer { public: - explicit PrintLayer(const LayerConfig& config) - : Layer(config) {} + explicit PrintLayer(const LayerConfig& config) : Layer(config) {} void forward(PassType passType); void backward(const UpdateCallback& callback) {} }; diff --git a/paddle/gserver/layers/Projection.cpp b/paddle/gserver/layers/Projection.cpp index aebc08f4a0e5937e50d11a5cc832b27210c8ea42..c7eb4b644281ff6e7b58201c41888d3a8967f419 100644 --- a/paddle/gserver/layers/Projection.cpp +++ b/paddle/gserver/layers/Projection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Projection.h" #include "ContextProjection.h" @@ -25,7 +24,8 @@ ClassRegistrar Projection::registrar_; Projection* Projection::create(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu) { + ParameterPtr parameter, + bool useGpu) { return registrar_.createByType(config.type(), config, parameter, useGpu); } diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h index 203edc5396a53cf72dcad6308335ba4731ba49bc..798503113d761091d1a1bdf9e4ec70e0c2c3b3a4 100644 --- a/paddle/gserver/layers/Projection.h +++ b/paddle/gserver/layers/Projection.h @@ -39,9 +39,11 @@ namespace paddle { class Projection { public: static Projection* create(const ProjectionConfig& config, - ParameterPtr parameter, bool useGpu); + ParameterPtr parameter, + bool useGpu); - Projection(const ProjectionConfig& config, ParameterPtr parameter, + Projection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) : config_(config), parameter_(parameter), useGpu_(useGpu) {} diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp index 30ef679f92c073cce5bb6edd11896007c0a8e68e..08453e21b8ff27138f9fa44ac834b54eb94c0688 100644 --- a/paddle/gserver/layers/RecurrentLayer.cpp +++ b/paddle/gserver/layers/RecurrentLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Layer.h" #include "paddle/utils/Stat.h" #include "SequenceToBatch.h" @@ -143,8 +142,8 @@ bool RecurrentLayer::init(const LayerMap& layerMap, void RecurrentLayer::resetState() { CHECK(!reversed_) << "state is not allowed for reversed recurrent layer"; - Matrix::resizeOrCreate(prevOutput_, 1, getSize(), /* trans= */ false, - useGpu_); + Matrix::resizeOrCreate( + prevOutput_, 1, getSize(), /* trans= */ false, useGpu_); prevOutput_->zeroMem(); } @@ -183,16 +182,23 @@ void RecurrentLayer::forward(PassType passType) { } } -void RecurrentLayer::forwardSequence(int batchSize, size_t numSequences, +void RecurrentLayer::forwardSequence(int batchSize, + size_t numSequences, const int* starts) { REGISTER_TIMER_INFO("RecurrentFwSequence", getName().c_str()); frameOutput_.reserve(batchSize); for (int i = frameOutput_.size(); i < batchSize; ++i) { Argument arg; - arg.value = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); - arg.grad = Matrix::create(nullptr, /* height= */ 1, getSize(), - /* trans= */ false, useGpu_); + arg.value = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); + arg.grad = Matrix::create(nullptr, + /* height= */ 1, + getSize(), + /* trans= */ false, + useGpu_); frameOutput_.push_back(arg); } @@ -213,8 +219,8 @@ void RecurrentLayer::forwardOneSequence(int start, int length) { } activation_->forward(frameOutput_[start]); for (int i = 1; i < length; ++i) { - frameOutput_[start + i].value->mul(frameOutput_[start + i - 1].value, - weight_->getW(), 1, 1); + frameOutput_[start + i].value->mul( + frameOutput_[start + i - 1].value, weight_->getW(), 1, 1); activation_->forward(frameOutput_[start + i]); } if (prevOutput_) { @@ -223,8 +229,8 @@ void RecurrentLayer::forwardOneSequence(int start, int length) { } else { activation_->forward(frameOutput_[start + length - 1]); for (int i = length - 2; i >= 0; --i) { - frameOutput_[start + i].value->mul(frameOutput_[start + i + 1].value, - weight_->getW(), 1, 1); + frameOutput_[start + i].value->mul( + frameOutput_[start + i + 1].value, weight_->getW(), 1, 1); activation_->forward(frameOutput_[start + i]); } } @@ -256,7 +262,8 @@ void RecurrentLayer::backward(const UpdateCallback& callback) { weight_->getParameterPtr()->incUpdate(callback); } -void RecurrentLayer::backwardSequence(int batchSize, size_t numSequences, +void RecurrentLayer::backwardSequence(int batchSize, + size_t numSequences, const int* starts) { REGISTER_TIMER_INFO("RecurrentBwSequence", getName().c_str()); for (int i = 0; i < batchSize; ++i) { @@ -274,31 +281,36 @@ void RecurrentLayer::backwardOneSequence(int start, int length) { if (!reversed_) { for (int i = length - 1; i > 0; --i) { activation_->backward(frameOutput_[start + i]); - frameOutput_[start + i - 1].grad->mul(frameOutput_[start + i].grad, - weightT, 1, 1); + frameOutput_[start + i - 1].grad->mul( + frameOutput_[start + i].grad, weightT, 1, 1); } activation_->backward(frameOutput_[start]); if (weight_->getWGrad()) { weight_->getWGrad()->mul( output_.value->subMatrix(start, length - 1)->getTranspose(), - output_.grad->subMatrix(start + 1, length - 1), 1, 1); + output_.grad->subMatrix(start + 1, length - 1), + 1, + 1); } } else { for (int i = 0; i < length - 1; ++i) { activation_->backward(frameOutput_[start + i]); - frameOutput_[start + i + 1].grad->mul(frameOutput_[start + i].grad, - weightT, 1, 1); + frameOutput_[start + i + 1].grad->mul( + frameOutput_[start + i].grad, weightT, 1, 1); } activation_->backward(frameOutput_[start + 
length - 1]); if (weight_->getWGrad()) { weight_->getWGrad()->mul( output_.value->subMatrix(start + 1, length - 1)->getTranspose(), - output_.grad->subMatrix(start, length - 1), 1, 1); + output_.grad->subMatrix(start, length - 1), + 1, + 1); } } } -void RecurrentLayer::forwardBatch(int batchSize, size_t numSequences, +void RecurrentLayer::forwardBatch(int batchSize, + size_t numSequences, const int* starts) { if (!batchValue_) { batchValue_.reset(new SequenceToBatch(useGpu_)); @@ -327,7 +339,8 @@ void RecurrentLayer::forwardBatch(int batchSize, size_t numSequences, batchValue_->copyBackSeq(*output_.value); } -void RecurrentLayer::backwardBatch(int batchSize, size_t numSequences, +void RecurrentLayer::backwardBatch(int batchSize, + size_t numSequences, const int* starts) { if (!batchGrad_) { batchGrad_.reset(new SequenceToBatch(useGpu_)); @@ -377,11 +390,15 @@ void RecurrentLayer::backwardBatch(int batchSize, size_t numSequences, if (!reversed_) { weight_->getWGrad()->mul( output_.value->subMatrix(starts[seq], len - 1)->getTranspose(), - output_.grad->subMatrix(starts[seq] + 1, len - 1), 1, 1); + output_.grad->subMatrix(starts[seq] + 1, len - 1), + 1, + 1); } else { weight_->getWGrad()->mul( output_.value->subMatrix(starts[seq] + 1, len - 1)->getTranspose(), - output_.grad->subMatrix(starts[seq], len - 1), 1, 1); + output_.grad->subMatrix(starts[seq], len - 1), + 1, + 1); } } } diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp index 62dbaa2674ce624dec44b8b3c86f9a08c1cfe0ee..a5443975da4ab6ecb302087fe71b018154d439b8 100644 --- a/paddle/gserver/layers/RecurrentLayerGroup.cpp +++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/gserver/layers/Layer.h" #include @@ -31,7 +30,8 @@ class RecurrentLayerGroup : public Layer { public: explicit RecurrentLayerGroup(const LayerConfig& config) : Layer(config) {} - void initSubNetwork(NeuralNetwork* rootNetwork, const ModelConfig& config, + void initSubNetwork(NeuralNetwork* rootNetwork, + const ModelConfig& config, const std::vector& parameterTypes, bool useGpu); @@ -53,7 +53,7 @@ public: /** * @see Layer.accessSubNetwork */ - void accessSubNetwork(const std::function &callback) { + void accessSubNetwork(const std::function& callback) { callback(*network_); } @@ -64,8 +64,10 @@ private: REGISTER_LAYER(recurrent_layer_group, RecurrentLayerGroup); void RecurrentLayerGroup::initSubNetwork( - NeuralNetwork* rootNetwork, const ModelConfig& config, - const std::vector& parameterTypes, bool useGpu) { + NeuralNetwork* rootNetwork, + const ModelConfig& config, + const std::vector& parameterTypes, + bool useGpu) { setNeedGradient(true); network_.reset(new RecurrentGradientMachine(config_.name(), rootNetwork)); diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp index dc573e838f71623e6985b19a4ae2cba6109ef6b5..3c478a33e350cf0e901381890e3df1496893f4db 100644 --- a/paddle/gserver/layers/ResizeLayer.cpp +++ b/paddle/gserver/layers/ResizeLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
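The RecurrentLayer hunks above implement the per-sequence recurrence out_t = act(in_t + out_{t-1} * W), stepped frame by frame (right to left when reversed_); forwardBatch only regroups frames so that each time step becomes one matrix multiply. An in-place sketch with tanh standing in for the configured activation (recurrentForward and the row-major layout are illustrative):

#include <cmath>
#include <cstddef>
#include <vector>

// frames holds `length` rows of `size` inputs and is overwritten with the
// outputs; W is size x size, row-major. Row t-1 is already activated when
// row t consumes it, mirroring forwardOneSequence.
void recurrentForward(std::vector<float>& frames, std::size_t length,
                      std::size_t size, const std::vector<float>& W) {
  for (std::size_t t = 0; t < length; ++t) {
    float* cur = &frames[t * size];
    if (t > 0) {
      const float* prev = &frames[(t - 1) * size];
      for (std::size_t j = 0; j < size; ++j)
        for (std::size_t i = 0; i < size; ++i)
          cur[j] += prev[i] * W[i * size + j];
    }
    for (std::size_t j = 0; j < size; ++j) cur[j] = std::tanh(cur[j]);
  }
}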
*/ - #include "Layer.h" #include "paddle/math/Matrix.h" #include "paddle/math/BaseMatrix.h" @@ -68,9 +67,11 @@ void ResizeLayer::backward(const UpdateCallback& callback) { return; } - MatrixPtr tmp = - Matrix::create(input.grad->getData(), height * width / getSize(), - getSize(), false, useGpu_); + MatrixPtr tmp = Matrix::create(input.grad->getData(), + height * width / getSize(), + getSize(), + false, + useGpu_); tmp->add(*output_.grad); } diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp index a494b401ff597290cf67ef55c4bf1b062da988ab..71570810f9576df74940968426c09ae421881ba6 100644 --- a/paddle/gserver/layers/ScalingLayer.cpp +++ b/paddle/gserver/layers/ScalingLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -26,7 +25,7 @@ namespace paddle { * \f[ * y.row[i] = w[i] * x.row[i] * \f] - * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is + * where \f$x\f$ is (batchSize x dataDim) input, \f$w\f$ is * (batchSize x 1) weight vector, and \f$y\f$ is (batchSize x dataDim) output. * * The config file api is scaling_layer. diff --git a/paddle/gserver/layers/ScalingProjection.cpp b/paddle/gserver/layers/ScalingProjection.cpp index c0a7072c6a7cc1d37723f43d1068483779f56437..7999d02d384a06b900fbfa2c8bb271660b7fe008 100644 --- a/paddle/gserver/layers/ScalingProjection.cpp +++ b/paddle/gserver/layers/ScalingProjection.cpp @@ -19,7 +19,8 @@ namespace paddle { class ScalingProjection : public Projection { public: ScalingProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu) + const ParameterPtr& parameter, + bool useGpu) : Projection(config, parameter, useGpu) { CHECK_EQ(parameter->getSize(), 1UL); weight_.reset(new Weight(1, 1, parameter)); @@ -33,10 +34,13 @@ public: void backward(const UpdateCallback& callback) { if (weight_->getWGrad()) { auto sum = Matrix::create(in_->value->getHeight(), 1, false, useGpu_); - sum->sumOfProducts(*in_->value, *out_->grad, - /* scaleSum= */1, /* scaleDest= */0); + sum->sumOfProducts(*in_->value, + *out_->grad, + /* scaleSum= */ 1, + /* scaleDest= */ 0); weight_->getWGrad()->sumCols(*sum, - /* scaleSum= */1, /* scaleDest= */1); + /* scaleSum= */ 1, + /* scaleDest= */ 1); parameter_->incUpdate(callback); } if (in_->grad) { diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp index 25ae9d519533a912fb32348c8a521405f6c77eb3..4dfa2c179dafe0d8dcc6766fbafeae129edcc49a 100644 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp +++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "SelectiveFullyConnectedLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" @@ -49,11 +48,11 @@ bool SelectiveFullyConnectedLayer::init(const LayerMap& layerMap, void SelectiveFullyConnectedLayer::prefetch() {} -void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, +void SelectiveFullyConnectedLayer::reserveOutput(size_t height, + size_t width, size_t nnz) { bool flag = (passType_ == PASS_TEST && - config_.selective_fc_pass_generation() && - !fullOutput_); + config_.selective_fc_pass_generation() && !fullOutput_); SetDevice device(output_.deviceId); if (flag) { // output_.value is sparse matrix @@ -61,8 +60,12 @@ void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, dynamic_cast(output_.value.get())) { output_.value = nullptr; } - Matrix::resizeOrCreateSparseMatrix(output_.value, height, width, nnz, - FLOAT_VALUE, SPARSE_CSR, + Matrix::resizeOrCreateSparseMatrix(output_.value, + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, /*trans=*/false, /*useGpu=*/useGpu_); output_.value->copyFrom(*selCols_); @@ -74,19 +77,31 @@ void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, dynamic_cast(output_.value.get())) { output_.value = nullptr; } - Matrix::resizeOrCreate(output_.value, height, width, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(output_.value, + height, + width, + /*trans=*/false, + /*useGpu=*/useGpu_); interOutput_ = output_.value; } else { // output_.value is dense matrix, but width = nnz /height CHECK_EQ(nnz % height, 0U); CHECK(nnz / height); - Matrix::resizeOrCreate(output_.value, height, nnz / height, - /*trans=*/false, /*useGpu=*/useGpu_); - interOutput_ = Matrix::createSparseMatrix( - output_.value->getData(), selCols_->getRows(), selCols_->getCols(), - height, width, nnz, FLOAT_VALUE, SPARSE_CSR, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(output_.value, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); + interOutput_ = Matrix::createSparseMatrix(output_.value->getData(), + selCols_->getRows(), + selCols_->getCols(), + height, + width, + nnz, + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); } } interOutput_->zeroMem(); @@ -97,8 +112,11 @@ void SelectiveFullyConnectedLayer::reserveOutput(size_t height, size_t width, CHECK(nnz / height) << "during training, " "each sample must have at least one column selected."; - Matrix::resizeOrCreate(output_.grad, height, nnz / height, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(output_.grad, + height, + nnz / height, + /*trans=*/false, + /*useGpu=*/useGpu_); output_.grad->zeroMem(); } } @@ -131,7 +149,7 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { real scaleT = i == 0 ? 
real(0) : real(1); flag = nnz < (hsize * wsize) * config_.selective_fc_full_mul_ratio() && - !fullOutput_; + !fullOutput_; if (flag) { // if the indices are highly sparse, // manually compute the multiplication of @@ -145,8 +163,11 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { if (fullOutput_) { interOutput_->mul(input, weight->getTranspose(), 1, scaleT); } else { - Matrix::resizeOrCreate(mmat_, hsize, wsize, - /*trans=*/false, /*useGpu=*/useGpu_); + Matrix::resizeOrCreate(mmat_, + hsize, + wsize, + /*trans=*/false, + /*useGpu=*/useGpu_); mmat_->mul(input, weight->getTranspose()); interOutput_->add3(mmat_); } @@ -158,7 +179,7 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) { } flag = (passType_ == PASS_TEST && config_.selective_fc_pass_generation() && - !fullOutput_); + !fullOutput_); if (flag) { // during generation, output of this layer is a sparse csr matrix, // which is probably the input of maxid layer @@ -166,8 +187,11 @@ // activation of this layer should be exponential, not softmax. Argument arg; - arg.value = Matrix::create(interOutput_->getData(), 1, nnz, - /*trans=*/false, /*useGpu=*/useGpu_); + arg.value = Matrix::create(interOutput_->getData(), + 1, + nnz, + /*trans=*/false, + /*useGpu=*/useGpu_); activation_->forward(arg); } else /* train and test in train, not generating */ { // during training, this layer output value is *Matrix*, which is input of @@ -187,17 +211,22 @@ void SelectiveFullyConnectedLayer::backward(const UpdateCallback& callback) { backwardActivation(); MatrixPtr oGrad = getOutputGrad(); if (!fullOutput_) { - interOutGrad_ = Matrix::createSparseMatrix( - oGrad->getData(), interOutput_->getRows(), interOutput_->getCols(), - interOutput_->getHeight(), interOutput_->getWidth(), - interOutput_->getElementCnt(), FLOAT_VALUE, SPARSE_CSR, - /*trans=*/false, - /*useGpu=*/useGpu_); + interOutGrad_ = Matrix::createSparseMatrix(oGrad->getData(), + interOutput_->getRows(), + interOutput_->getCols(), + interOutput_->getHeight(), + interOutput_->getWidth(), + interOutput_->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSR, + /*trans=*/false, + /*useGpu=*/useGpu_); } else { - interOutGrad_ = - Matrix::create(oGrad->getData(), oGrad->getHeight(), oGrad->getWidth(), - /*trans=*/false, - /*useGpu=*/useGpu_); + interOutGrad_ = Matrix::create(oGrad->getData(), + oGrad->getHeight(), + oGrad->getWidth(), + /*trans=*/false, + /*useGpu=*/useGpu_); } if (biases_ && biases_->getWGrad()) { @@ -240,13 +269,21 @@ void paddle::SelectiveFullyConnectedLayer::fillSelectiveData( size_t sampleNum = candidates->size(); size_t outputWidth = getSize(); size_t nnz = - std::accumulate(candidates->begin(), candidates->end(), 0UL, + std::accumulate(candidates->begin(), + candidates->end(), + 0UL, [](size_t a, const std::pair& arr) { return a + arr.second; }); Matrix::resizeOrCreateSparseMatrix(this->cpuSelCols_, - sampleNum, outputWidth, nnz, NO_VALUE, SPARSE_CSR, false, false); + sampleNum, + outputWidth, + nnz, + NO_VALUE, + SPARSE_CSR, + false, + false); CHECK(this->cpuSelCols_ != nullptr); CpuSparseMatrixPtr selCols = std::dynamic_pointer_cast(cpuSelCols_); @@ -272,7 +309,13 @@ void paddle::SelectiveFullyConnectedLayer::fillSelectiveData( this->selCols_ = this->cpuSelCols_; } else { Matrix::resizeOrCreateSparseMatrix(this->selCols_, - sampleNum, outputWidth, nnz, NO_VALUE, SPARSE_CSR, false, true); + sampleNum, + outputWidth, + nnz, + NO_VALUE, + SPARSE_CSR, + false, + true);
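The selective path in the hunks above skips the full in * W^T product: per sample, only the columns recorded in selCols_ are evaluated and the results fill a CSR-shaped output. A plain-array sketch of that idea (selectiveFcForward and the CSR parameter names are assumptions of this sketch, not Paddle's API):

#include <cstddef>
#include <vector>

// rowStarts/selected form a CSR selection: sample s uses columns
// selected[rowStarts[s] .. rowStarts[s+1]); out holds one value per
// selected column.
void selectiveFcForward(const float* in, std::size_t dim, const float* weight,
                        const std::vector<int>& rowStarts,
                        const std::vector<int>& selected, float* out) {
  for (std::size_t s = 0; s + 1 < rowStarts.size(); ++s) {
    const float* x = in + s * dim;
    for (int k = rowStarts[s]; k < rowStarts[s + 1]; ++k) {
      const float* w = weight + selected[k] * dim;  // one weight row per column
      float dot = 0.0f;
      for (std::size_t d = 0; d < dim; ++d) dot += x[d] * w[d];
      out[k] = dot;
    }
  }
}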
this->selCols_->copyFrom(*cpuSelCols_, HPPL_STREAM_1); hl_stream_synchronize(HPPL_STREAM_1); } diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h index c152151cff051bc0f62bcf6702d6c6c649be8003..9f92ae060521bd7852b67d45649d1cd0792961d4 100644 --- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h +++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" @@ -98,8 +97,6 @@ private: /** * @brief Make SelectiveFC act as FullyConnectedLayer */ - void fillFullySelectiveData() { - fullOutput_ = true; - } + void fillFullySelectiveData() { fullOutput_ = true; } }; } // namespace paddle diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp index dfce4dcb196132414542d4fe9f0d97200e44779c..bd72ba3d167d99b5d3fdd047d6b1bfab611b3232 100644 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ b/paddle/gserver/layers/SequenceConcatLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -68,13 +67,11 @@ void SequenceConcatLayer::forward(PassType passType) { const Argument& input1 = getInput(0); size_t numSequences1 = input1.getNumSequences(); - auto startPositions1 = - input1.sequenceStartPositions->getVector(false); + auto startPositions1 = input1.sequenceStartPositions->getVector(false); const Argument& input2 = getInput(1); size_t numSequences2 = input2.getNumSequences(); - auto startPositions2 = - input2.sequenceStartPositions->getVector(false); + auto startPositions2 = input2.sequenceStartPositions->getVector(false); CHECK_EQ(dim, input1.value->getWidth()); CHECK_EQ(startPositions1->getData()[numSequences1], input1.getBatchSize()); @@ -117,8 +114,8 @@ void SequenceConcatLayer::forward(PassType passType) { } // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences1 + 1, false); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences1 + 1, false); int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); @@ -150,10 +147,8 @@ void SequenceConcatLayer::backward(const UpdateCallback& callback) { MatrixPtr inputGrad1 = getInputGrad(0); MatrixPtr inputGrad2 = getInputGrad(1); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = - getInput(0).sequenceStartPositions->getVector(false); - auto startPositions2 = - getInput(1).sequenceStartPositions->getVector(false); + auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); + auto startPositions2 = getInput(1).sequenceStartPositions->getVector(false); size_t numSequences1 = startPositions1->getSize() - 1; size_t numSequences2 = startPositions2->getSize() - 1; diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp index 26d9536dd57aa3e8f5b3b548730d06b89feed68f..0e9531eabb4b389b762e235ec01d5f16c88cd4a1 100644 --- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp +++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "SequencePoolLayer.h" diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp index 55be73d363df19bc3c597252b0e5e2518bcee849..c9f19b7d3b66b3ac031135c04a96ffe27245aa01 100644 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -58,7 +58,7 @@ void SequencePoolLayer::forward(PassType passType) { resetOutput(newBatchSize_, dim); if (type_) { CHECK(input.subSequenceStartPositions) - << "when trans_type = seq, input must hasSubseq"; + << "when trans_type = seq, input must hasSubseq"; } /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, * thus, in this case, output_ has no sequenceStartPositions. diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp index 05766706b002c0ab1a1ee3d5c34f134985a975eb..5ca9b8b300161688817234909f2b875801d90995 100644 --- a/paddle/gserver/layers/SequenceReshapeLayer.cpp +++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -69,8 +68,7 @@ void SequenceReshapeLayer::forward(PassType passType) { size_t outDim = getSize(); size_t numSequences = input.getNumSequences(); - auto startPositions = - input.sequenceStartPositions->getVector(false); + auto startPositions = input.sequenceStartPositions->getVector(false); const int* starts = startPositions->getData(); CHECK_EQ(starts[numSequences], input.getBatchSize()); @@ -96,9 +94,7 @@ void SequenceReshapeLayer::forward(PassType passType) { // modify the sequenceStartPositions ICpuGpuVector::resizeOrCreate( - output_.sequenceStartPositions, - numSequences + 1, - false); + output_.sequenceStartPositions, numSequences + 1, false); int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); @@ -134,8 +130,11 @@ void SequenceReshapeLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("SequenceReshapeLayerBackward", getName().c_str()); if (inputGrad) { - Matrix::resizeOrCreate(reshapedOutputGrad, inputGrad->getHeight(), - inputGrad->getWidth(), false, useGpu_); + Matrix::resizeOrCreate(reshapedOutputGrad, + inputGrad->getHeight(), + inputGrad->getWidth(), + false, + useGpu_); reshapedOutputGrad->copyFrom(*outputGrad); inputGrad->add(*reshapedOutputGrad); } diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp index 88eace28b2afff982614375da8c1dd03ab324fdc..04402db9c8af2f51f30a09cbf1e9c4023fe3e531 100644 --- a/paddle/gserver/layers/SequenceToBatch.cpp +++ b/paddle/gserver/layers/SequenceToBatch.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "SequenceToBatch.h" @@ -21,8 +20,10 @@ limitations under the License. 
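SequenceReshapeLayer above keeps each sequence's payload while reinterpreting its width, so only the start positions need recomputing: they scale by inDim / outDim. A sketch of that bookkeeping (reshapeStarts is an illustrative name; starts[i] * inDim is assumed divisible by outDim, matching the layer's consistency checks):

#include <cstddef>
#include <vector>

// New sequence start offsets after changing the row width from inDim to outDim.
std::vector<int> reshapeStarts(const std::vector<int>& starts, int inDim,
                               int outDim) {
  std::vector<int> out(starts.size());
  for (std::size_t i = 0; i < starts.size(); ++i)
    out[i] = starts[i] * inDim / outDim;
  return out;
}
// e.g. starts {0, 4, 10} with inDim=6, outDim=3 becomes {0, 8, 20}.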
*/ namespace paddle { -void SequenceToBatch::resizeOrCreateBatch(int batchSize, size_t numSequences, - const int *seqStarts, bool reversed, +void SequenceToBatch::resizeOrCreateBatch(int batchSize, + size_t numSequences, + const int *seqStarts, + bool reversed, bool prevBatchState) { CHECK_EQ(seqStarts[numSequences], batchSize); IVector::resizeOrCreate(seq2BatchIdx_, batchSize, useGpu_); @@ -50,7 +51,8 @@ void SequenceToBatch::resizeOrCreateBatch(int batchSize, size_t numSequences, int length = seqStarts[seqId + 1] - seqStarts[seqId]; seqStartAndLength.emplace_back(seqStarts[seqId], length, seqId); } - std::sort(seqStartAndLength.begin(), seqStartAndLength.end(), + std::sort(seqStartAndLength.begin(), + seqStartAndLength.end(), [](SeqStartAndLength a, SeqStartAndLength b) { return a.length_ > b.length_; }); @@ -122,15 +124,19 @@ void SequenceToBatch::resizeOrCreateBatch(int batchSize, size_t numSequences, } void SequenceToBatch::resizeOrCreate(Matrix &seqValue) { - Matrix::resizeOrCreate(batchValue_, seqValue.getHeight(), seqValue.getWidth(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(batchValue_, + seqValue.getHeight(), + seqValue.getWidth(), + /* trans= */ false, + useGpu_); } MatrixPtr SequenceToBatch::getBatchValue(int batchId, int numRows) { return getBatchValue(*batchValue_, batchId, numRows); } -MatrixPtr SequenceToBatch::getBatchValue(Matrix &batchValue, int batchId, +MatrixPtr SequenceToBatch::getBatchValue(Matrix &batchValue, + int batchId, int numRows) { int *batchStartPositions = batchStartPositions_->getData(); int start = batchStartPositions[batchId]; @@ -151,7 +157,8 @@ void SequenceToBatch::getSeqOutputFromBatch(Matrix &sequence, Matrix &batch) { sequence2BatchCopy(sequence, batch, *seqEndIdxInBatch_, true); } -void SequenceToBatch::sequence2BatchCopy(Matrix &batch, Matrix &sequence, +void SequenceToBatch::sequence2BatchCopy(Matrix &batch, + Matrix &sequence, IVector &seq2BatchIdx, bool seq2batch) { int seqWidth = sequence.getWidth(); @@ -161,23 +168,27 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch, Matrix &sequence, int *idxData = seq2BatchIdx.getData(); if (useGpu_) { - hl_sequence2batch_copy(batchData, seqData, idxData, seqWidth, - batchCount, seq2batch); + hl_sequence2batch_copy( + batchData, seqData, idxData, seqWidth, batchCount, seq2batch); } else { for (int i = 0; i < batchCount; ++i) { if (seq2batch) { - memcpy(batch.rowBuf(i), sequence.rowBuf(idxData[i]), + memcpy(batch.rowBuf(i), + sequence.rowBuf(idxData[i]), seqWidth * sizeof(real)); } else { - memcpy(sequence.rowBuf(idxData[i]), batch.rowBuf(i), + memcpy(sequence.rowBuf(idxData[i]), + batch.rowBuf(i), seqWidth * sizeof(real)); } } } } -void SequenceToBatch::sequence2BatchAdd(Matrix &batch, Matrix &sequence, - IVector &seq2BatchIdx, bool seq2batch) { +void SequenceToBatch::sequence2BatchAdd(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, + bool seq2batch) { int seqWidth = sequence.getWidth(); int batchCount = batch.getHeight(); real *batchData = batch.getData(); @@ -185,8 +196,8 @@ void SequenceToBatch::sequence2BatchAdd(Matrix &batch, Matrix &sequence, int *idxData = seq2BatchIdx.getData(); if (useGpu_) { - hl_sequence2batch_add(batchData, seqData, idxData, seqWidth, - batchCount, seq2batch); + hl_sequence2batch_add( + batchData, seqData, idxData, seqWidth, batchCount, seq2batch); } else { for (int i = 0; i < batchCount; ++i) { if (seq2batch) { @@ -199,8 +210,11 @@ void SequenceToBatch::sequence2BatchAdd(Matrix &batch, Matrix &sequence, } void 
SequenceToBatch::copyFromSeq(Matrix &seqValue) { - Matrix::resizeOrCreate(batchValue_, seqValue.getHeight(), seqValue.getWidth(), - /* trans= */ false, useGpu_); + Matrix::resizeOrCreate(batchValue_, + seqValue.getHeight(), + seqValue.getWidth(), + /* trans= */ false, + useGpu_); sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, true); } @@ -208,12 +222,14 @@ void SequenceToBatch::copyBackSeq(Matrix &seqValue) { sequence2BatchCopy(*batchValue_, seqValue, *seq2BatchIdx_, false); } -void SequenceToBatch::copy(Matrix &seqValue, Matrix &batchValue, +void SequenceToBatch::copy(Matrix &seqValue, + Matrix &batchValue, bool seq2batch) { sequence2BatchCopy(batchValue, seqValue, *seq2BatchIdx_, seq2batch); } -void SequenceToBatch::add(Matrix &seqValue, Matrix &batchValue, +void SequenceToBatch::add(Matrix &seqValue, + Matrix &batchValue, bool seq2batch) { sequence2BatchAdd(batchValue, seqValue, *seq2BatchIdx_, seq2batch); } diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h index 8cba7ea3b98c3a7774f331ce88160cb9a7a89743..6bc12f207ee3fadbd2a75ca5a5dbb7ce199cc99b 100644 --- a/paddle/gserver/layers/SequenceToBatch.h +++ b/paddle/gserver/layers/SequenceToBatch.h @@ -43,8 +43,10 @@ public: explicit SequenceToBatch(bool useGpu) : useGpu_(useGpu) {} /* resize and calculate the batchIndex_ */ - void resizeOrCreateBatch(int batchSize, size_t numSequences, - const int *seqStarts, bool reversed, + void resizeOrCreateBatch(int batchSize, + size_t numSequences, + const int *seqStarts, + bool reversed, bool prevBatchState = false); /* sequence matrix and batch matrix copy: @@ -81,9 +83,13 @@ public: } protected: - void sequence2BatchCopy(Matrix &batch, Matrix &sequence, - IVector &seq2BatchIdx, bool seq2batch); - void sequence2BatchAdd(Matrix &batch, Matrix &sequence, IVector &seq2BatchIdx, + void sequence2BatchCopy(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, + bool seq2batch); + void sequence2BatchAdd(Matrix &batch, + Matrix &sequence, + IVector &seq2BatchIdx, bool seq2batch); IVectorPtr batchStartPositions_; diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp index af5fccf6506b6d37faaa030fc2696ac29586908f..dd6ffcd50b01cfa56ee9fbc428ffc2cb9b73ce17 100644 --- a/paddle/gserver/layers/SlopeInterceptLayer.cpp +++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -21,7 +20,8 @@ limitations under the License. */ namespace paddle { /** - * @brief A layer for applying a slope and an intercept to the input element-wise. + * @brief A layer for applying a slope and an intercept to the input + * element-wise. * This layer is used in NEURAL TURING MACHINE. * @note There is no activation and weight in this layer. * @@ -29,7 +29,8 @@ namespace paddle { * y = ax + b * \f] * - * Here, a is scale and b is offset, which are provided as attributes of the layer. + * Here, a is scale and b is offset, which are provided as attributes of the + * layer. * * The config file api is slope_intercept_layer. 
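sequence2BatchCopy above is a row permutation through seq2BatchIdx_: with seq2batch true it gathers sequence rows into batch order, otherwise it scatters them back. The CPU branch of the hunk, restated as a self-contained sketch:

#include <cstddef>
#include <cstring>

// batch row i corresponds to sequence row idx[i]; width is the row length.
void seq2BatchCopy(float* batch, float* sequence, const int* idx,
                   int batchCount, int width, bool seq2batch) {
  for (int i = 0; i < batchCount; ++i) {
    if (seq2batch)
      std::memcpy(batch + i * width, sequence + idx[i] * width,
                  width * sizeof(float));
    else
      std::memcpy(sequence + idx[i] * width, batch + i * width,
                  width * sizeof(float));
  }
}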
*/ diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp index 2fcfc8e1ae68a47822ce8f375fb94ecdb196dea6..9609919695853552ed54d8d55e8a669002fa3147 100644 --- a/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.cpp @@ -93,7 +93,8 @@ bool SpatialPyramidPoolLayer::init(const LayerMap& layerMap, size_t endCol = 0; for (size_t i = 0; i < pyramidHeight_; i++) { poolProjections_.emplace_back(PoolProjection::create( - getConfig(imgSizeW_, imgSizeH_, channels_, i, poolType_), nullptr, + getConfig(imgSizeW_, imgSizeH_, channels_, i, poolType_), + nullptr, useGpu_)); endCol += poolProjections_[i]->getOutputSize(); projCol_.push_back(std::make_pair(startCol, endCol)); diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h index e15b6d2f85c6f5b9620e28aaef9c6246341611f9..79db574d99bdb1137e6a55244c382f9c894239c8 100644 --- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h +++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h @@ -24,7 +24,7 @@ namespace paddle { * @brief A layer for spatial pyramid pooling on the input image by taking * the max, average, etc. within regions, so that the result vectors of * differently sized images are of the same size. - * + * The config file api is spp_layer. */ @@ -47,8 +47,11 @@ public: virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - ProjectionConfig getConfig(size_t sizeX_, size_t sizeY_, size_t channels, - size_t pyamidLevel_, std::string& poolType_); + ProjectionConfig getConfig(size_t sizeX_, + size_t sizeY_, + size_t channels, + size_t pyamidLevel_, + std::string& poolType_); size_t getSize(); virtual void forward(PassType passType); diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp index ccf65ba649f21478ae20902ccd8db0a4734e22e2..664f9e13c055df08552974048428326644b69a6e 100644 --- a/paddle/gserver/layers/SubSequenceLayer.cpp +++ b/paddle/gserver/layers/SubSequenceLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
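SpatialPyramidPoolLayer above concatenates one PoolProjection per pyramid level and records each level's [startCol, endCol) slice in projCol_. A sketch of that bookkeeping under the standard SPP assumption that level i pools into a 2^i x 2^i grid per channel (not verified against getConfig):

#include <cstddef>
#include <utility>
#include <vector>

// Column ranges of each level in the concatenated output.
std::vector<std::pair<std::size_t, std::size_t>> sppColumns(
    std::size_t pyramidHeight, std::size_t channels) {
  std::vector<std::pair<std::size_t, std::size_t>> projCol;
  std::size_t startCol = 0, endCol = 0;
  for (std::size_t i = 0; i < pyramidHeight; ++i) {
    endCol += channels * (std::size_t(1) << i) * (std::size_t(1) << i);
    projCol.emplace_back(startCol, endCol);
    startCol = endCol;
  }
  return projCol;
}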
*/ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -75,18 +74,15 @@ void SubSequenceLayer::forward(PassType passType) { const Argument& input = getInput(0); size_t numSequences1 = input.getNumSequences(); - auto startPositions1 = - input.sequenceStartPositions->getVector(false); + auto startPositions1 = input.sequenceStartPositions->getVector(false); const Argument& offsetSeq = getInput(1); size_t numSequences2 = offsetSeq.getNumSequences(); - auto startPositions2 = - offsetSeq.sequenceStartPositions->getVector(false); + auto startPositions2 = offsetSeq.sequenceStartPositions->getVector(false); const Argument& sizeSeq = getInput(2); size_t numSequences3 = sizeSeq.getNumSequences(); - auto startPositions3 = - sizeSeq.sequenceStartPositions->getVector(false); + auto startPositions3 = sizeSeq.sequenceStartPositions->getVector(false); CHECK_EQ(dim, input.value->getWidth()); @@ -143,8 +139,8 @@ void SubSequenceLayer::forward(PassType passType) { } // modify the sequenceStartPositions - ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions, - numSequences1 + 1, false); + ICpuGpuVector::resizeOrCreate( + output_.sequenceStartPositions, numSequences1 + 1, false); int* tgtBuf = output_.sequenceStartPositions->getMutableData(false); int offset = 0; @@ -177,8 +173,7 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) { MatrixPtr inputGrad1 = getInputGrad(0); MatrixPtr outputGrad = getOutputGrad(); - auto startPositions1 = - getInput(0).sequenceStartPositions->getVector(false); + auto startPositions1 = getInput(0).sequenceStartPositions->getVector(false); size_t numSequences1 = startPositions1->getSize() - 1; const int* starts1 = startPositions1->getData(); diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp index 7b61dd08227253c6ac8bbd44c4a852c972762fe0..bcf39168408d2bac50c17d0e22ed747cf0b33d80 100644 --- a/paddle/gserver/layers/SumToOneNormLayer.cpp +++ b/paddle/gserver/layers/SumToOneNormLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "Layer.h" #include "paddle/math/Matrix.h" @@ -21,7 +20,7 @@ limitations under the License. */ namespace paddle { /** - * A layer for sum-to-one normalization, + * A layer for sum-to-one normalization, * which is used in NEURAL TURING MACHINE. * \f[ * out[i] = \frac {in[i]} {\sum_{k=1}^N in[k]} diff --git a/paddle/gserver/layers/TableProjection.cpp b/paddle/gserver/layers/TableProjection.cpp index 947d8cf9be1b4a6a5ce87bdcc57aa3c23967393e..2bc0d329d9605850ecdce6b4a87351579493d834 100644 --- a/paddle/gserver/layers/TableProjection.cpp +++ b/paddle/gserver/layers/TableProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "TableProjection.h" namespace paddle { @@ -20,7 +19,8 @@ namespace paddle { REGISTER_PROJECTION(table, TableProjection); TableProjection::TableProjection(const ProjectionConfig& config, - const ParameterPtr& parameter, bool useGpu) + const ParameterPtr& parameter, + bool useGpu) : Projection(config, parameter, useGpu) { table_.reset( new Weight(config.input_size(), config.output_size(), parameter)); diff --git a/paddle/gserver/layers/TableProjection.h b/paddle/gserver/layers/TableProjection.h index eadf2de623cdf2990bc731cefcac66958c61a311..97c672508a009735a9a8f9980b715881c1f824a2 100644 --- a/paddle/gserver/layers/TableProjection.h +++ b/paddle/gserver/layers/TableProjection.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Projection.h" @@ -34,7 +33,8 @@ namespace paddle { */ class TableProjection : public Projection { public: - TableProjection(const ProjectionConfig& config, const ParameterPtr& parameter, + TableProjection(const ProjectionConfig& config, + const ParameterPtr& parameter, bool useGpu); /** * If use sparse row matrix as parameter, prefetch feature ids in input label. diff --git a/paddle/gserver/layers/TensorLayer.cpp b/paddle/gserver/layers/TensorLayer.cpp index 84fe9005b003db65e1ae9072669c215a961556ab..03586cc6ff3d148a63af33d89b85d565e2198057 100644 --- a/paddle/gserver/layers/TensorLayer.cpp +++ b/paddle/gserver/layers/TensorLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "TensorLayer.h" #include "paddle/utils/Logging.h" @@ -72,7 +71,9 @@ void TensorLayer::forward(PassType passType) { MatrixPtr input1 = getInputValue(0); MatrixPtr input2 = getInputValue(1); MatrixPtr tmpMat = Matrix::create(input2->getHeight(), - input2->getWidth(), /* trans= */ false, input2->useGpu()); + input2->getWidth(), + /* trans= */ false, + input2->useGpu()); REGISTER_TIMER_INFO("TensorFwMulTimer", getName().c_str()); for (size_t i = 0; i < getSize(); ++i) { MatrixPtr weights = weights_[i]->getW(); @@ -101,7 +102,9 @@ void TensorLayer::backward(const UpdateCallback& callback) { MatrixPtr input2 = getInputValue(1); MatrixPtr oGrad = getOutputGrad(); MatrixPtr tmpMat = Matrix::create(input1->getHeight(), - input1->getWidth(), /* trans= */ false, input1->useGpu()); + input1->getWidth(), + /* trans= */ false, + input1->useGpu()); /* trans(grad * e1) * e2 */ { REGISTER_TIMER_INFO("TensorGradMulTimer", getName().c_str()); diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h index 83b87b1307ac1faa5511b69aa89c6482cbfd9d44..9ac651de4d99a23a12394c674bda827e935749b9 100644 --- a/paddle/gserver/layers/TensorLayer.h +++ b/paddle/gserver/layers/TensorLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp index f8827bec63a9bc0aa7391906af82d5053b9ccca3..53a24d4cc4633898cff1b56f5a377959a38f6354 100644 --- a/paddle/gserver/layers/TransLayer.cpp +++ b/paddle/gserver/layers/TransLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include "TransLayer.h" namespace paddle { diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h index 867ccb4d1950cf6b9f5e6da01a11b0abfed14072..25b091f9f414ead5048cd65cfc16b67ae1387ad9 100644 --- a/paddle/gserver/layers/TransLayer.h +++ b/paddle/gserver/layers/TransLayer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Layer.h" diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp index 6e3f6bf2e496cf2e1a4bada5a9dc621024b08996..c883283f782352e674d0fcf0369e8491e31d60ff 100644 --- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp +++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Stat.h" #include "Projection.h" @@ -27,7 +26,8 @@ namespace paddle { class TransposedFullMatrixProjection : public Projection { public: TransposedFullMatrixProjection(const ProjectionConfig& config, - ParameterPtr parameter, bool useGPu); + ParameterPtr parameter, + bool useGPu); virtual void forward(); virtual void backward(const UpdateCallback& callback); diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/gserver/layers/ValidationLayer.cpp index 48a7b54338fca36095d9cd4af49e09b7fb22dfdf..0fee4bd2463ac86dfcb5ecc0b5e75564d86971d2 100644 --- a/paddle/gserver/layers/ValidationLayer.cpp +++ b/paddle/gserver/layers/ValidationLayer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -68,8 +67,11 @@ void AucValidation::validationImp(MatrixPtr output, IVectorPtr label) { if (dynamic_cast(output.get())) { size_t height = output->getHeight(); size_t width = output->getWidth(); - Matrix::resizeOrCreate(cpuOutput_, height, width, - /* trans=*/false, /* useGpu=*/false); + Matrix::resizeOrCreate(cpuOutput_, + height, + width, + /* trans=*/false, + /* useGpu=*/false); cpuOutput_->copyFrom(*output); IVector::resizeOrCreate(cpuLabel_, height, false); cpuLabel_->copyFrom(*label); diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index bc7bee0e4bbc8c365505619f6fa21d2a88433fcd..47575169172832cd3f95a53ed6e4dcb87a5b7a4b 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
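The LayerGradUtil hunks that follow reflow Paddle's gradient-checking helpers (getCostSum, getDiffAndPrint, testPerturbParameter, testPerturbInput). As background, the underlying idea is a finite-difference check: nudge a value, re-measure the cost, and compare the numeric slope against the analytic gradient. A hedged sketch of that comparison in isolation (plain C++; the helper name and the normalization are illustrative, not the exact formula getDiffAndPrint uses):

#include <algorithm>
#include <cmath>
#include <functional>

// Central-difference gradient check for a single scalar parameter x.
double relativeGradError(const std::function<double(double)>& cost,
                         double x,
                         double analyticGrad,
                         double step = 1e-4) {
  double numericGrad = (cost(x + step) - cost(x - step)) / (2.0 * step);
  double scale = std::max(std::fabs(numericGrad) + std::fabs(analyticGrad), 1e-12);
  return std::fabs(numericGrad - analyticGrad) / scale;  // small when the two agree
}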
*/ - #include "LayerGradUtil.h" P_DECLARE_bool(thread_local_rand_use_global_seed); @@ -28,8 +27,13 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) { return Argument::sumCosts(outArgs); } -real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, - char fill, string testLayerName, string name, real step, +real getDiffAndPrint(real newCost1, + real newCost2, + real callbackCount, + char fill, + string testLayerName, + string name, + real step, real delta) { EXPECT_FALSE(std::isnan(newCost1)); EXPECT_FALSE(std::isnan(newCost2)); @@ -49,7 +53,8 @@ real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, return diff; } -void testState(LayerPtr testLayer, vector& dataLayers, +void testState(LayerPtr testLayer, + vector& dataLayers, vector& datas) { auto batchSize = datas[0].getBatchSize(); Argument data; @@ -82,8 +87,8 @@ void testState(LayerPtr testLayer, vector& dataLayers, data.value = datas[j].value->subMatrix(batchId, 1); } if (datas[j].ids) { - data.ids = IVector::create(datas[j].ids->getData() + batchId, 1, - FLAGS_use_gpu); + data.ids = IVector::create( + datas[j].ids->getData() + batchId, 1, FLAGS_use_gpu); } dataLayers[j]->setData(data); dataLayers[j]->forward(PASS_TEST); @@ -128,7 +133,8 @@ void testState(LayerPtr testLayer, vector& dataLayers, } } -void testBatchState(LayerPtr testLayer, vector& dataLayers, +void testBatchState(LayerPtr testLayer, + vector& dataLayers, vector& datas) { auto batchSize = datas[0].getBatchSize(); Argument data; @@ -192,8 +198,10 @@ void testBatchState(LayerPtr testLayer, vector& dataLayers, splitData.sequenceStartPositions = cpuSeqStartPos; for (size_t j = 0; j < datas.size(); ++j) { if (datas[j].value) { - Matrix::resizeOrCreate(splitData.value, splitBatchSize, - datas[j].value->getWidth(), false, + Matrix::resizeOrCreate(splitData.value, + splitBatchSize, + datas[j].value->getWidth(), + false, FLAGS_use_gpu); for (size_t seqId = 0; seqId < numSequences; ++seqId) { if (seqLens[seqId]) { @@ -268,8 +276,10 @@ void initWeight(MatrixPtr& weights) { weights->copyFrom(*tmpMat); } -void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, - LayerStatePtr state, bool useGpu) { +void initBatchState(LayerPtr dataLayer, + LayerPtr testLayer, + LayerStatePtr state, + bool useGpu) { int sequenceNum = dataLayer->getOutput().getNumSequences(); MatrixPtr prevBatchOutput = Matrix::create(sequenceNum, testLayer->getSize(), false, useGpu); @@ -282,9 +292,13 @@ void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, state->value.push_back(prevBatchState); } -void initDataLayer(TestConfig testConf, std::vector* dataLayers, - vector* datas, LayerMap* layerMap, - string testLayerName, size_t batchSize, bool trans, +void initDataLayer(TestConfig testConf, + std::vector* dataLayers, + vector* datas, + LayerMap* layerMap, + string testLayerName, + size_t batchSize, + bool trans, bool useGpu) { ICpuGpuVectorPtr sequenceStartPositions; ICpuGpuVectorPtr subSequenceStartPositions; @@ -328,13 +342,17 @@ void initDataLayer(TestConfig testConf, std::vector* dataLayers, break; case INPUT_SPARSE_NON_VALUE_DATA: data.value = makeRandomSparseMatrix( - batchSize, layer->getSize(), - /* withValue= */ false, useGpu, + batchSize, + layer->getSize(), + /* withValue= */ false, + useGpu, testConf.inputDefs[i].sparse.equalNnzPerSample); break; case INPUT_SPARSE_FLOAT_VALUE_DATA: - data.value = makeRandomSparseMatrix(batchSize, layer->getSize(), - /* withValue= */ true, useGpu); + data.value = makeRandomSparseMatrix(batchSize, + layer->getSize(), + /* 
withValue= */ true, + useGpu); break; case INPUT_DENSE_DIM_DATA: fillData(trans, layer->getSize(), numSequence); @@ -379,16 +397,21 @@ void initDataLayer(TestConfig testConf, std::vector* dataLayers, } } -void initTestLayer(TestConfig testConf, LayerMap* layerMap, - std::vector* parameters, LayerPtr* testLayer) { +void initTestLayer(TestConfig testConf, + LayerMap* layerMap, + std::vector* parameters, + LayerPtr* testLayer) { ParameterMap parameterMap; size_t index = 0; LayerConfig testConfig = testConf.layerConfig; CHECK_EQ(testConf.inputDefs.size(), size_t(testConf.layerConfig.inputs_size())); - auto initParameter = [&](string paraName, size_t paraSize, bool isStatic, - bool initialize, ParameterConfig paraConfig) { + auto initParameter = [&](string paraName, + size_t paraSize, + bool isStatic, + bool initialize, + ParameterConfig paraConfig) { paraConfig.set_name(paraName); paraConfig.set_size(paraSize); paraConfig.set_initial_std(1); @@ -431,8 +454,11 @@ void initTestLayer(TestConfig testConf, LayerMap* layerMap, if (testConf.biasSize) { testConfig.set_bias_parameter_name("bias"); ParameterConfig paraConfig; - initParameter(testConfig.bias_parameter_name(), testConf.biasSize, - testConf.staticBias, true, paraConfig); + initParameter(testConfig.bias_parameter_name(), + testConf.biasSize, + testConf.staticBias, + true, + paraConfig); } *testLayer = Layer::create(testConfig); @@ -441,9 +467,13 @@ void initTestLayer(TestConfig testConf, LayerMap* layerMap, (*testLayer)->setNeedGradient(true); } -void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, - real callbackCount, real* maxDiff, LayerPtr testLayer, +void testPerturbParameter(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector* parameters) { char fill = ' '; for (auto& parameter : *parameters) { @@ -481,9 +511,14 @@ void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, parameter->setValueUpdated(); newCost[k] = getCostSum(testLayer, weights); } - real diff = getDiffAndPrint(newCost[0], newCost[1], callbackCount, fill, - testLayer->getName(), parameter->getName(), - step, delta); + real diff = getDiffAndPrint(newCost[0], + newCost[1], + callbackCount, + fill, + testLayer->getName(), + parameter->getName(), + step, + delta); *maxDiff = std::max(*maxDiff, abs(diff)); // restore parameter parameter->getBuf(PARAMETER_VALUE)->copyFrom(oldPara); @@ -492,9 +527,13 @@ void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, } } -void testPerturbInput(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, real callbackCount, - real* maxDiff, LayerPtr testLayer, +void testPerturbInput(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector dataLayers) { char fill = ' '; for (size_t index = 0; index < testConf.inputDefs.size(); index++) { @@ -539,9 +578,14 @@ void testPerturbInput(TestConfig testConf, const MatrixPtr weights, newCost[k] = getCostSum(testLayer, weights); } - real diff = getDiffAndPrint(newCost[0], newCost[1], callbackCount, fill, + real diff = getDiffAndPrint(newCost[0], + newCost[1], + callbackCount, + fill, testLayer->getName(), - dataLayers[index]->getName(), step, delta); + dataLayers[index]->getName(), + step, + delta); *maxDiff = std::max(*maxDiff, abs(diff)); // restore parameter 
outV->copyFrom(oldPara); @@ -549,9 +593,13 @@ void testPerturbInput(TestConfig testConf, const MatrixPtr weights, } } -void testLayerGradKernel(TestConfig testConf, string testLayerName, - size_t batchSize, bool trans, bool useGpu, - bool useWeight, float epsilon) { +void testLayerGradKernel(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight, + float epsilon) { #ifdef PADDLE_ONLY_CPU if (useGpu) return; #endif @@ -566,8 +614,14 @@ void testLayerGradKernel(TestConfig testConf, string testLayerName, std::vector dataLayers; LayerMap layerMap; vector datas; - initDataLayer(testConf, &dataLayers, &datas, &layerMap, testLayerName, - batchSize, trans, useGpu); + initDataLayer(testConf, + &dataLayers, + &datas, + &layerMap, + testLayerName, + batchSize, + trans, + useGpu); // test layer initialize std::vector parameters; LayerPtr testLayer; @@ -620,17 +674,28 @@ void testLayerGradKernel(TestConfig testConf, string testLayerName, ++callbackCount; } for (size_t i = 0; i < parameters.size(); ++i) { - EXPECT_EQ(parameters[i]->isStatic() ? 0 : callbackCount, - callbackFlags[i]); + EXPECT_EQ(parameters[i]->isStatic() ? 0 : callbackCount, callbackFlags[i]); } // Test whether the layer's forward calculation is stable // by adding perturbation to its parameters or its input layers real maxDiff = 0; - testPerturbParameter(testConf, weights, state, cost, callbackCount, &maxDiff, - testLayer, &parameters); - testPerturbInput(testConf, weights, state, cost, callbackCount, &maxDiff, - testLayer, dataLayers); + testPerturbParameter(testConf, + weights, + state, + cost, + callbackCount, + &maxDiff, + testLayer, + &parameters); + testPerturbInput(testConf, + weights, + state, + cost, + callbackCount, + &maxDiff, + testLayer, + dataLayers); EXPECT_LE(fabs(maxDiff), epsilon); if (testConf.testState) { @@ -641,10 +706,15 @@ void testLayerGradKernel(TestConfig testConf, string testLayerName, } } -void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, - bool trans, bool useGpu, bool useWeight, float epsilon) { - testLayerGradKernel(testConf, testLayerName, batchSize, trans, useGpu, - useWeight, epsilon); +void testLayerGrad(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight, + float epsilon) { + testLayerGradKernel( + testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon); bool isStaticTest = false; LayerConfig testConfig = testConf.layerConfig; for (size_t i = 0; i < testConf.inputDefs.size(); i++) { @@ -662,14 +732,19 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, isStaticTest = true; } if (isStaticTest) { - testLayerGradKernel(testConf, testLayerName, batchSize, trans, useGpu, - useWeight, epsilon); + testLayerGradKernel( + testConf, testLayerName, batchSize, trans, useGpu, useWeight, epsilon); } } -void testProjectionGrad(ProjectionConfig conf, InputType inputType, - size_t parameterSize, size_t batchSize, bool useGpu, - bool testState, int biasSize, bool sharedBias) { +void testProjectionGrad(ProjectionConfig conf, + InputType inputType, + size_t parameterSize, + size_t batchSize, + bool useGpu, + bool testState, + int biasSize, + bool sharedBias) { TestConfig config; conf.set_name(conf.type()); config.layerConfig.set_type("mixed"); @@ -684,8 +759,11 @@ void testProjectionGrad(ProjectionConfig conf, InputType inputType, testLayerGrad(config, "mixed", batchSize, false, useGpu); } -void testOperatorGrad(TestConfig& config, 
OperatorConfig& operatorConf, - size_t batchSize, bool useGpu, bool testState) { +void testOperatorGrad(TestConfig& config, + OperatorConfig& operatorConf, + size_t batchSize, + bool useGpu, + bool testState) { config.layerConfig.set_type("mixed"); operatorConf.set_output_size(config.layerConfig.size()); diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 3b9ec803959b372a960ed705da5abf7d301a2c64..a061c7fc533ff2c639ceda4db6d89a33fd3f0435 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -72,7 +72,10 @@ struct InputDef { sparse = {""}; isStatic = false; } - InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn, + InputDef(InputType type, + string nameIn, + size_t dimIn, + size_t sizeIn, ParaSparse sparseIn) { inputType = type; name = nameIn; @@ -98,11 +101,18 @@ struct TestConfig { testBatchState(false) {} }; -real getCostSum(ParameterPtr& parameter, CpuVector& cpuPara, - LayerPtr& testLayer, MatrixPtr weights = nullptr); +real getCostSum(ParameterPtr& parameter, + CpuVector& cpuPara, + LayerPtr& testLayer, + MatrixPtr weights = nullptr); -real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, - char fill, string testLayerName, string name, real step, +real getDiffAndPrint(real newCost1, + real newCost2, + real callbackCount, + char fill, + string testLayerName, + string name, + real step, real delta); /** @@ -113,7 +123,8 @@ real getDiffAndPrint(real newCost1, real newCost2, real callbackCount, * @param dataLayers[in/out] dataLayers * @param datas[in/out] data of dataLayers */ -void testState(LayerPtr testLayer, vector& dataLayers, +void testState(LayerPtr testLayer, + vector& dataLayers, vector& datas); /** @@ -124,7 +135,8 @@ void testState(LayerPtr testLayer, vector& dataLayers, * @param dataLayers[in/out] dataLayers * @param datas[in/out] data of dataLayers */ -void testBatchState(LayerPtr testLayer, vector& dataLayers, +void testBatchState(LayerPtr testLayer, + vector& dataLayers, vector& datas); /** @@ -144,8 +156,10 @@ double genPerturbation(const real* oldGrad, real* newGrad, size_t dim); void initWeight(MatrixPtr& weights); -void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, - LayerStatePtr state, bool useGpu); +void initBatchState(LayerPtr dataLayer, + LayerPtr testLayer, + LayerStatePtr state, + bool useGpu); /** * @brief initialize the dataLayer by its inputType @@ -155,9 +169,13 @@ void initBatchState(LayerPtr dataLayer, LayerPtr testLayer, * datas[out] initialized data of dataLayers * layerMap[out] layerMap */ -void initDataLayer(TestConfig testConf, std::vector* dataLayers, - vector* datas, LayerMap* layerMap, - string testLayerName, size_t batchSize, bool trans, +void initDataLayer(TestConfig testConf, + std::vector* dataLayers, + vector* datas, + LayerMap* layerMap, + string testLayerName, + size_t batchSize, + bool trans, bool useGpu); /** @@ -168,8 +186,10 @@ void initDataLayer(TestConfig testConf, std::vector* dataLayers, * parameters[out] parameters of testLayer * testLayer[out] testLayer */ -void initTestLayer(TestConfig testConf, LayerMap* layerMap, - std::vector* parameters, LayerPtr* testLayer); +void initTestLayer(TestConfig testConf, + LayerMap* layerMap, + std::vector* parameters, + LayerPtr* testLayer); /** * @brief Test whether the layer's forward calculation is stable by adding @@ -184,9 +204,13 @@ void initTestLayer(TestConfig testConf, LayerMap* layerMap, * testLayer[in/out] testLayer * parameters[in/out] parameters of testLayer */ 
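To make the protocol described in the comment above concrete before its declaration: the helper nudges the weights in opposite directions, re-evaluates the cost each time, restores the original values, and tracks the largest discrepancy against an epsilon. A simplified element-wise sketch under that reading (plain C++, independent of the Paddle types; the real helper perturbs whole parameter buffers at once):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <functional>
#include <vector>

// Perturb each weight up and down, re-evaluate the cost, restore the weight,
// and keep the largest cost discrepancy observed.
double maxPerturbDiff(std::vector<double> w,
                      const std::function<double(const std::vector<double>&)>& cost,
                      double step = 1e-3) {
  double maxDiff = 0.0;
  for (std::size_t i = 0; i < w.size(); ++i) {
    const double orig = w[i];
    w[i] = orig + step;
    double costPlus = cost(w);
    w[i] = orig - step;
    double costMinus = cost(w);
    w[i] = orig;  // restore parameter, as the helpers do
    maxDiff = std::max(maxDiff, std::fabs(costPlus - costMinus));
  }
  return maxDiff;
}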
-void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, - real callbackCount, real* maxDiff, LayerPtr testLayer, +void testPerturbParameter(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector* parameters); /** @@ -202,25 +226,44 @@ void testPerturbParameter(TestConfig testConf, const MatrixPtr weights, * testLayer[in/out] testLayer * dataLayers[in/out] dataLayers */ -void testPerturbInput(TestConfig testConf, const MatrixPtr weights, - const LayerStatePtr state, real cost, real callbackCount, - real* maxDiff, LayerPtr testLayer, +void testPerturbInput(TestConfig testConf, + const MatrixPtr weights, + const LayerStatePtr state, + real cost, + real callbackCount, + real* maxDiff, + LayerPtr testLayer, std::vector dataLayers); -void testLayerGradKernel(TestConfig testConf, string testLayerName, - size_t batchSize, bool trans, bool useGpu, - bool useWeight = false, float epsilon = 0.02); +void testLayerGradKernel(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight = false, + float epsilon = 0.02); -void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, - bool trans, bool useGpu, bool useWeight = false, +void testLayerGrad(TestConfig testConf, + string testLayerName, + size_t batchSize, + bool trans, + bool useGpu, + bool useWeight = false, float epsilon = 0.02); -void testProjectionGrad(ProjectionConfig conf, InputType inputType, - size_t parameterSize, size_t batchSize, bool useGpu, - bool testState = false, int biasSize = 0, +void testProjectionGrad(ProjectionConfig conf, + InputType inputType, + size_t parameterSize, + size_t batchSize, + bool useGpu, + bool testState = false, + int biasSize = 0, bool sharedBias = false); -void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf, - size_t batchSize, bool useGpu, bool testState = false); +void testOperatorGrad(TestConfig& config, + OperatorConfig& operatorConf, + size_t batchSize, + bool useGpu, + bool testState = false); } // namespace paddle diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/gserver/tests/TestUtil.cpp index 97fbcc8176326357fdc406a9a04a4e3a937a2105..84d516683c18551765d707f26cc7003ba3432c7f 100644 --- a/paddle/gserver/tests/TestUtil.cpp +++ b/paddle/gserver/tests/TestUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
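The TestUtil.cpp hunk that follows reformats makeRandomSparseMatrix, which assembles a SPARSE_CSR matrix from an indices array of size height + 1 and a per-entry cols array. For readers unfamiliar with the layout, a minimal CSR traversal in plain C++ (names illustrative):

#include <cstdio>
#include <vector>

// In CSR, indices[row] .. indices[row + 1] delimit row's entries in cols/values.
// A NO_VALUE matrix stores no values array; its entries are typically treated as 1.
void printCsr(const std::vector<int>& indices,    // size: height + 1
              const std::vector<int>& cols,       // size: nnz
              const std::vector<float>& values) { // empty for NO_VALUE
  for (std::size_t row = 0; row + 1 < indices.size(); ++row) {
    for (int k = indices[row]; k < indices[row + 1]; ++k) {
      std::printf("(%zu, %d) = %g\n", row, cols[k],
                  values.empty() ? 1.0f : values[k]);
    }
  }
}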
*/ - #include "TestUtil.h" #include "paddle/utils/CommandLineParser.h" @@ -30,8 +29,11 @@ std::string randStr(const int len) { return s; } -MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, - bool useGpu, bool equalNnzPerSample) { +MatrixPtr makeRandomSparseMatrix(size_t height, + size_t width, + bool withValue, + bool useGpu, + bool equalNnzPerSample) { std::vector ids(height); std::vector indices(height + 1); indices[0] = 0; @@ -55,8 +57,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, for (size_t i = 0; i < data.size(); ++i) { data[i].col = uniformRandom(width); } - auto mat = Matrix::createSparseMatrix(height, width, data.size(), NO_VALUE, - SPARSE_CSR, false, useGpu); + auto mat = Matrix::createSparseMatrix( + height, width, data.size(), NO_VALUE, SPARSE_CSR, false, useGpu); if (useGpu) { std::dynamic_pointer_cast(mat)->copyFrom( ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT); @@ -93,7 +95,7 @@ void generateSequenceStartPositions(size_t batchSize, } void generateSequenceStartPositions(size_t batchSize, - ICpuGpuVectorPtr& sequenceStartPositions) { + ICpuGpuVectorPtr& sequenceStartPositions) { int numSeqs; if (FLAGS_fixed_seq_length != 0) { numSeqs = std::ceil((float)batchSize / (float)FLAGS_fixed_seq_length); @@ -101,7 +103,7 @@ void generateSequenceStartPositions(size_t batchSize, numSeqs = batchSize / 10 + 1; } sequenceStartPositions = - ICpuGpuVector::create(numSeqs + 1, /* useGpu= */false); + ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false); int* buf = sequenceStartPositions->getMutableData(false); int64_t pos = 0; int len = FLAGS_fixed_seq_length; @@ -109,7 +111,8 @@ void generateSequenceStartPositions(size_t batchSize, for (int i = 0; i < numSeqs; ++i) { if (FLAGS_fixed_seq_length == 0) { len = uniformRandom( - std::min(maxLen, batchSize - pos - numSeqs + i)) + 1; + std::min(maxLen, batchSize - pos - numSeqs + i)) + + 1; } buf[i] = pos; pos += len; @@ -118,7 +121,6 @@ void generateSequenceStartPositions(size_t batchSize, buf[numSeqs] = batchSize; } - void generateSubSequenceStartPositions( const ICpuGpuVectorPtr& sequenceStartPositions, ICpuGpuVectorPtr& subSequenceStartPositions) { @@ -148,7 +150,6 @@ void generateSubSequenceStartPositions( subBuf[j] = buf[numSeqs]; } - void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions, IVectorPtr& cpuSequenceDims) { /* generate sequences with 2 dims */ @@ -174,9 +175,8 @@ void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions, } } -void generateMDimSequenceData( - const ICpuGpuVectorPtr& sequenceStartPositions, - IVectorPtr& cpuSequenceDims) { +void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions, + IVectorPtr& cpuSequenceDims) { /* generate sequences with 2 dims */ int numSeqs = sequenceStartPositions->getSize() - 1; int numDims = 2; diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/gserver/tests/TestUtil.h index 6a75f92ffe2f640fddd45d610645274a941a61c3..000f8884e8681db8f4d2a2d6454791958b964f92 100644 --- a/paddle/gserver/tests/TestUtil.h +++ b/paddle/gserver/tests/TestUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -28,8 +27,11 @@ inline bool approximatelyEqual(float a, float b, float epsilon) { return fabs(a - b) <= ((fabs(a) < fabs(b) ? 
fabs(b) : fabs(a)) * epsilon); } -MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, - bool useGpu, bool equalNnzPerSample = false); +MatrixPtr makeRandomSparseMatrix(size_t height, + size_t width, + bool withValue, + bool useGpu, + bool equalNnzPerSample = false); /** * @brief generate sequenceStartPositions for INPUT_SEQUENCE_DATA, @@ -39,10 +41,10 @@ MatrixPtr makeRandomSparseMatrix(size_t height, size_t width, bool withValue, * sequenceStartPositions[out] generation output */ void generateSequenceStartPositions(size_t batchSize, - IVectorPtr& sequenceStartPositions); + IVectorPtr& sequenceStartPositions); void generateSequenceStartPositions(size_t batchSize, - ICpuGpuVectorPtr& sequenceStartPositions); + ICpuGpuVectorPtr& sequenceStartPositions); /** * @brief generate subSequenceStartPositions for INPUT_HASSUB_SEQUENCE_DATA @@ -51,9 +53,8 @@ void generateSequenceStartPositions(size_t batchSize, * @param sequenceStartPositions[in] input * subSequenceStartPositions[out] generation output */ -void generateSubSequenceStartPositions( - const IVectorPtr& sequenceStartPositions, - IVectorPtr& subSequenceStartPositions); +void generateSubSequenceStartPositions(const IVectorPtr& sequenceStartPositions, + IVectorPtr& subSequenceStartPositions); void generateSubSequenceStartPositions( const ICpuGpuVectorPtr& sequenceStartPositions, @@ -66,12 +67,10 @@ void generateSubSequenceStartPositions( * @param sequenceStartPositions[in] input * cpuSequenceDims[out] generation output */ -void generateMDimSequenceData( - const IVectorPtr& sequenceStartPositions, - IVectorPtr& cpuSequenceDims); -void generateMDimSequenceData( - const ICpuGpuVectorPtr& sequenceStartPositions, - IVectorPtr& cpuSequenceDims); +void generateMDimSequenceData(const IVectorPtr& sequenceStartPositions, + IVectorPtr& cpuSequenceDims); +void generateMDimSequenceData(const ICpuGpuVectorPtr& sequenceStartPositions, + IVectorPtr& cpuSequenceDims); void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b); diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp index 2c5d17090dfc7772c84477cb721b084b7a03c835..e54c5109e71de1a41ec2bda2af4a19745acbbc83 100644 --- a/paddle/gserver/tests/test_ActivationGrad.cpp +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -42,9 +42,9 @@ void testActivation(const string& act) { testLayerGrad(config, act + "_activation", 100, - /* trans= */false, + /* trans= */ false, useGpu, - /* useWeight */true); + /* useWeight */ true); } } diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp index bff7222b29907cb66d79decea76e1b5e26205ddf..f3efdfb428d14435fbfced6cfef3b7dadd8ff5a9 100644 --- a/paddle/gserver/tests/test_ConvTrans.cpp +++ b/paddle/gserver/tests/test_ConvTrans.cpp @@ -36,206 +36,206 @@ P_DECLARE_bool(prev_batch_state); // Test that the convTrans forward is the same as conv backward TEST(Layer, convTransLayerFwd) { - // Setting up conv-trans layer - TestConfig configt; - configt.biasSize = 3; - configt.layerConfig.set_type("exconvt"); - configt.layerConfig.set_num_filters(3); - configt.layerConfig.set_partial_sum(1); - configt.layerConfig.set_shared_biases(true); - - configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); - LayerInputConfig* input = configt.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(16); - conv->set_padding(0); - conv->set_padding_y(1); - 
conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(3 / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), - conv->padding(), conv->stride(), - /* caffeMode */ true)); - configt.layerConfig.set_size(conv->img_size() * conv->img_size() * - configt.layerConfig.num_filters()); - configt.layerConfig.set_name("convTrans"); - - // data layer initialize - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer(configt, &dataLayers, &datas, &layerMap, "convTrans", - 100, false, false); - // test layer initialize - std::vector parameters; - LayerPtr convtLayer; - initTestLayer(configt, &layerMap, &parameters, &convtLayer); - convtLayer->getBiasParameter()->zeroMem(); - convtLayer->forward(PASS_GC); - - // Setting up conv-layer config - TestConfig config; - config.biasSize = 16; - config.layerConfig.set_type("exconv"); - config.layerConfig.set_num_filters(16); - config.layerConfig.set_partial_sum(1); - config.layerConfig.set_shared_biases(true); - - config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); - input = config.layerConfig.add_inputs(); - conv = input->mutable_conv_conf(); - conv->set_filter_size(2); - conv->set_filter_size_y(4); - conv->set_channels(3); - conv->set_padding(0); - conv->set_padding_y(1); - conv->set_stride(2); - conv->set_stride_y(2); - conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); - conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), - conv->padding(), conv->stride(), - /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); - config.layerConfig.set_name("conv"); - - // data layer initialize - std::vector dataLayers2; - LayerMap layerMap2; - vector datas2; - initDataLayer(config, &dataLayers2, &datas2, &layerMap2, "conv", - 100, false, false); - // test layer initialize - std::vector parameters2; - LayerPtr convLayer; - initTestLayer(config, &layerMap2, &parameters2, &convLayer); - - // Sync convLayer and convtLayer parameter - convLayer->getBiasParameter()->zeroMem(); - convLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->copyFrom( - *(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); - - // Set convLayer outputGrad as convTransLayer input value - convLayer->forward(PASS_GC); - convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); - - vector callbackFlags(parameters2.size(), 0); - auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; - convLayer->backward(callback); - - // Check that the convLayer backward is the same as convTransLayer forward - checkMatrixEqual(convtLayer->getOutputValue(), - dataLayers2[0]->getOutputGrad()); + // Setting up conv-trans layer + TestConfig configt; + configt.biasSize = 3; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(3); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(16); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(3 / conv->groups()); + conv->set_img_size(16); + 
conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + // data layer initialize + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 100, false, false); + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, &parameters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->forward(PASS_GC); + + // Setting up conv-layer config + TestConfig config; + config.biasSize = 16; + config.layerConfig.set_type("exconv"); + config.layerConfig.set_num_filters(16); + config.layerConfig.set_partial_sum(1); + config.layerConfig.set_shared_biases(true); + + config.inputDefs.push_back({INPUT_DATA, "layer_1", 768, 384}); + input = config.layerConfig.add_inputs(); + conv = input->mutable_conv_conf(); + conv->set_filter_size(2); + conv->set_filter_size_y(4); + conv->set_channels(3); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(16); + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true)); + config.layerConfig.set_size(conv->output_x() * conv->output_x() * + config.layerConfig.num_filters()); + config.layerConfig.set_name("conv"); + + // data layer initialize + std::vector dataLayers2; + LayerMap layerMap2; + vector datas2; + initDataLayer( + config, &dataLayers2, &datas2, &layerMap2, "conv", 100, false, false); + // test layer initialize + std::vector parameters2; + LayerPtr convLayer; + initTestLayer(config, &layerMap2, &parameters2, &convLayer); + + // Sync convLayer and convtLayer parameter + convLayer->getBiasParameter()->zeroMem(); + convLayer->getParameters()[0] + ->getBuf(PARAMETER_VALUE) + ->copyFrom(*(convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE))); + + // Set convLayer outputGrad as convTransLayer input value + convLayer->forward(PASS_GC); + convLayer->getOutput().grad->copyFrom(*(dataLayers[0]->getOutputValue())); + + vector callbackFlags(parameters2.size(), 0); + auto callback = [&](Parameter* para) { ++callbackFlags[para->getID()]; }; + convLayer->backward(callback); + + // Check that the convLayer backward is the same as convTransLayer forward + checkMatrixEqual(convtLayer->getOutputValue(), + dataLayers2[0]->getOutputGrad()); } - // Do one forward pass of convTrans layer and check to see if its output // matches the given result -void doOneConvtTest(size_t imgSize, size_t output_x, size_t stride, - size_t padding, size_t filter_size, MatrixPtr& result) { - TestConfig configt; - configt.biasSize = 1; - configt.layerConfig.set_type("exconvt"); - configt.layerConfig.set_num_filters(1); - configt.layerConfig.set_partial_sum(1); - configt.layerConfig.set_shared_biases(true); - - configt.inputDefs.push_back({INPUT_DATA, "layer_0", output_x * output_x, - filter_size * filter_size}); - LayerInputConfig* input = configt.layerConfig.add_inputs(); - ConvConfig* conv = input->mutable_conv_conf(); - conv->set_filter_size(filter_size); - conv->set_filter_size_y(filter_size); - conv->set_channels(1); - conv->set_padding(padding); - conv->set_padding_y(padding); - 
conv->set_stride(stride); - conv->set_stride_y(stride); - conv->set_groups(1); - conv->set_filter_channels(1); - conv->set_img_size(imgSize); - conv->set_output_x(output_x); - - configt.layerConfig.set_size(conv->img_size() * conv->img_size() * - configt.layerConfig.num_filters()); - configt.layerConfig.set_name("convTrans"); - - std::vector dataLayers; - LayerMap layerMap; - vector datas; - initDataLayer(configt, &dataLayers, &datas, &layerMap, "convTrans", - 1, false, false); - dataLayers[0]->getOutputValue()->zeroMem(); - dataLayers[0]->getOutputValue()->add(1.0); - - // test layer initialize - std::vector parameters; - LayerPtr convtLayer; - initTestLayer(configt, &layerMap, &parameters, &convtLayer); - convtLayer->getBiasParameter()->zeroMem(); - convtLayer->getParameters()[0]->zeroMem(); - convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); - convtLayer->forward(PASS_GC); - - checkMatrixEqual(convtLayer->getOutputValue(), result); +void doOneConvtTest(size_t imgSize, + size_t output_x, + size_t stride, + size_t padding, + size_t filter_size, + MatrixPtr& result) { + TestConfig configt; + configt.biasSize = 1; + configt.layerConfig.set_type("exconvt"); + configt.layerConfig.set_num_filters(1); + configt.layerConfig.set_partial_sum(1); + configt.layerConfig.set_shared_biases(true); + + configt.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x, filter_size * filter_size}); + LayerInputConfig* input = configt.layerConfig.add_inputs(); + ConvConfig* conv = input->mutable_conv_conf(); + conv->set_filter_size(filter_size); + conv->set_filter_size_y(filter_size); + conv->set_channels(1); + conv->set_padding(padding); + conv->set_padding_y(padding); + conv->set_stride(stride); + conv->set_stride_y(stride); + conv->set_groups(1); + conv->set_filter_channels(1); + conv->set_img_size(imgSize); + conv->set_output_x(output_x); + + configt.layerConfig.set_size(conv->img_size() * conv->img_size() * + configt.layerConfig.num_filters()); + configt.layerConfig.set_name("convTrans"); + + std::vector dataLayers; + LayerMap layerMap; + vector datas; + initDataLayer( + configt, &dataLayers, &datas, &layerMap, "convTrans", 1, false, false); + dataLayers[0]->getOutputValue()->zeroMem(); + dataLayers[0]->getOutputValue()->add(1.0); + + // test layer initialize + std::vector parameters; + LayerPtr convtLayer; + initTestLayer(configt, &layerMap, &parameters, &convtLayer); + convtLayer->getBiasParameter()->zeroMem(); + convtLayer->getParameters()[0]->zeroMem(); + convtLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->add(1.0); + convtLayer->forward(PASS_GC); + + checkMatrixEqual(convtLayer->getOutputValue(), result); } TEST(Layer, convTransLayerFwd2) { - MatrixPtr result; - result = Matrix::create(1, 5 * 5, false, false); - result->zeroMem(); - result->add(1.0); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 1, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 5, - result); - - float resultData[] = {1, 2, 2, 2, 1, - 2, 4, 4, 4, 2, - 2, 4, 4, 4, 2, - 2, 4, 4, 4, 2, - 1, 2, 2, 2, 1}; - result->setData(resultData); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 1, - /* padding */ 0, - /* filter_size */ 4, - result); - - float resultData2[] = {1, 2, 2, 2, 1, - 2, 4, 4, 4, 2, - 2, 4, 4, 4, 2, - 2, 4, 4, 4, 2, - 1, 2, 2, 2, 1}; - result->setData(resultData2); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 2, - /* padding */ 1, - /* filter_size */ 5, - result); - - float resultData3[] = {1, 1, 2, 1, 1, - 1, 1, 2, 1, 1, - 2, 2, 4, 2, 2, - 
1, 1, 2, 1, 1, - 1, 1, 2, 1, 1}; - result->setData(resultData3); - doOneConvtTest(/* imgSize */ 5, - /* output_x */ 2, - /* stride */ 2, - /* padding */ 0, - /* filter_size */ 3, - result);} + MatrixPtr result; + result = Matrix::create(1, 5 * 5, false, false); + result->zeroMem(); + result->add(1.0); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 1, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 5, + result); + + float resultData[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 4, + result); + + float resultData2[] = {1, 2, 2, 2, 1, 2, 4, 4, 4, 2, 2, 4, 4, + 4, 2, 2, 4, 4, 4, 2, 1, 2, 2, 2, 1}; + result->setData(resultData2); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 1, + /* filter_size */ 5, + result); + + float resultData3[] = {1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 4, + 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1}; + result->setData(resultData3); + doOneConvtTest(/* imgSize */ 5, + /* output_x */ 2, + /* stride */ 2, + /* padding */ 0, + /* filter_size */ 3, + result); +} int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 3a591a316b8bafccac9c59ff28e57b4e27f8377a..be639ea09380d02ed8251874bf690fc3596bddf2 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "ModelConfig.pb.h" @@ -48,8 +47,10 @@ struct TestConfig { TestConfig() : testAccumulate(true) {} }; -void testEvaluator(TestConfig testConf, string testEvaluatorName, - size_t batchSize, bool useGpu) { +void testEvaluator(TestConfig testConf, + string testEvaluatorName, + size_t batchSize, + bool useGpu) { #ifdef PADDLE_ONLY_CPU if (useGpu) return; #endif @@ -79,8 +80,10 @@ void testEvaluator(TestConfig testConf, string testEvaluatorName, data.ids->rand(dim); // now rand number can be 0 to inputDefs[i].dim. 
break; case INPUT_SPARSE_NON_VALUE_DATA: - data.value = makeRandomSparseMatrix(batchSize, dim, - /* withValue= */ false, useGpu); + data.value = makeRandomSparseMatrix(batchSize, + dim, + /* withValue= */ false, + useGpu); break; default: LOG(FATAL) << " unknown inputType "; @@ -116,8 +119,9 @@ void testEvaluator(TestConfig testConf, string testEvaluatorName, } } -void testEvaluatorAll(TestConfig testConf, string testEvaluatorName, - size_t batchSize) { +void testEvaluatorAll(TestConfig testConf, + string testEvaluatorName, + size_t batchSize) { testEvaluator(testConf, testEvaluatorName, batchSize, true); testEvaluator(testConf, testEvaluatorName, batchSize, false); } @@ -142,8 +146,8 @@ TEST(Evaluator, classification_error) { config.evaluatorConfig.set_classification_threshold(0.4); config.inputDefs.push_back({INPUT_DATA, "weight", 1}); // Not support GPU - testEvaluator(config, "classification_error_weight_multi_binary_label", 50, - false); + testEvaluator( + config, "classification_error_weight_multi_binary_label", 50, false); } TEST(Evaluator, sum) { @@ -211,8 +215,8 @@ TEST(Evaluator, precision_recall) { config.evaluatorConfig.set_classification_threshold(0.4); config.inputDefs.push_back({INPUT_DATA, "weight", 1}); // Not support GPU - testEvaluator(config, "precision_recall_weight_multi_binary_label", 100, - false); + testEvaluator( + config, "precision_recall_weight_multi_binary_label", 100, false); } TEST(Evaluator, ctc_error_evaluator) { diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index a79dfe39c9bb26c7b2acec1051699e1804494d93..374ae57dd3681f891cf3f5b698085f0b8fbc6cd7 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -69,8 +69,10 @@ TEST(Projection, context) { std::max(0, conf.context_start() + conf.context_length() - 1); for (auto useGpu : {false, true}) { testProjectionGrad( - conf, INPUT_SEQUENCE_DATA, - trainablePadding ? conf.input_size() * pad : 0, batchSize, + conf, + INPUT_SEQUENCE_DATA, + trainablePadding ? 
conf.input_size() * pad : 0, + batchSize, useGpu, contextStart + contextLength <= 1); // = testState } @@ -86,8 +88,11 @@ TEST(Projection, trans_fc) { conf.set_input_size(50); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1000, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1000, + /* batchSize */ 100, + useGpu); } } @@ -97,8 +102,11 @@ TEST(Projection, fc) { conf.set_input_size(10); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 200, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); } } @@ -108,8 +116,11 @@ TEST(Projection, dot_mul) { conf.set_input_size(20); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 20, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 20, + /* batchSize */ 100, + useGpu); } } @@ -119,8 +130,11 @@ TEST(Projection, table) { conf.set_input_size(10); conf.set_output_size(20); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_LABEL, /* parameterSize */ 200, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_LABEL, + /* parameterSize */ 200, + /* batchSize */ 100, + useGpu); } } @@ -130,8 +144,11 @@ TEST(Projection, identity) { conf.set_input_size(10); conf.set_output_size(10); for (auto useGpu : {false, true}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 0, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 0, + /* batchSize */ 100, + useGpu); } } @@ -141,8 +158,11 @@ TEST(Projection, scaling) { conf.set_input_size(10); conf.set_output_size(10); for (auto useGpu : {false}) { - testProjectionGrad(conf, INPUT_DATA, /* parameterSize */ 1, - /* batchSize */ 100, useGpu); + testProjectionGrad(conf, + INPUT_DATA, + /* parameterSize */ 1, + /* batchSize */ 100, + useGpu); } } @@ -169,20 +189,29 @@ TEST(Projection, conv) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); - int output_x = - outputSize(conv->img_size(), conv->filter_size(), conv->padding(), - conv->stride(), /* caffeMode */ true); - int output_y = - outputSize(conv->img_size(), conv->filter_size_y(), conv->padding_y(), - conv->stride_y(), /* caffeMode */ true); + int output_x = outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int output_y = outputSize(conv->img_size(), + conv->filter_size_y(), + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); conv->set_output_x(output_x); conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); conf.set_output_size(output_x * output_y * NUM_FILTERS); testProjectionGrad( - conf, INPUT_DATA, + conf, + INPUT_DATA, /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y, - /* batchSize */ 100, true, false, NUM_FILTERS, true); + /* batchSize */ 100, + true, + false, + NUM_FILTERS, + true); } #endif @@ -253,8 +282,13 @@ TEST(Layer, CRFLayer) { config.layerConfig.add_inputs(); // Not support GPU now - testLayerGrad(config, "crf", 100, /* trans */ false, /* useGpu */ false, - false /*useWeight*/, 0.03 /*epsilon*/); + testLayerGrad(config, + "crf", + 100, + /* trans */ false, + /* useGpu */ false, + false /*useWeight*/, + 0.03 /*epsilon*/); 
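For reference, the outputSize(...) helper these conv tests keep rewrapping computes, in caffeMode, (imgSize - filterSize + 2 * padding) / stride + 1; the non-caffe branch rounds up instead. With the exconvt/exconv configs above (img_size 16, filter 2, padding 0, stride 2) that gives output_x = 8, which is why layer_0's size of 1024 equals 8 * 8 * 16 channels. A compile-time restatement (formula inferred from how the tests use it; names illustrative):

// Plain C++ restatement of the caffeMode branch of outputSize().
constexpr int outputSizeCaffe(int imgSize, int filterSize, int padding, int stride) {
  return (imgSize - filterSize + 2 * padding) / stride + 1;
}
static_assert(outputSizeCaffe(16, 2, 0, 2) == 8,
              "output_x for the 16x16, filter 2, stride 2 configs above");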
} TEST(Layer, CTCLayer) { @@ -327,8 +361,10 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), - conv->padding(), conv->stride(), + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), /* caffeMode */ true)); config.layerConfig.set_size(conv->output_x() * conv->output_x() * config.layerConfig.num_filters()); @@ -346,7 +382,6 @@ TEST(Layer, convLayer) { #endif } - void testConvTransLayer(const string& type, bool trans, bool useGpu) { TestConfig config; config.biasSize = 3; @@ -368,8 +403,10 @@ void testConvTransLayer(const string& type, bool trans, bool useGpu) { conv->set_groups(1); conv->set_filter_channels(3 / conv->groups()); conv->set_img_size(16); - conv->set_output_x(outputSize(conv->img_size(), conv->filter_size(), - conv->padding(), conv->stride(), + conv->set_output_x(outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), /* caffeMode */ true)); config.layerConfig.set_size(conv->img_size() * conv->img_size() * @@ -403,14 +440,16 @@ TEST(Layer, blockExpandLayer) { blockExpand->set_block_y(32); blockExpand->set_stride_x(2); blockExpand->set_stride_y(2); - blockExpand->set_output_x( - outputSize(blockExpand->img_size_x(), blockExpand->block_x(), - blockExpand->padding_x(), blockExpand->stride_x(), - /* caffeMode */ false)); - blockExpand->set_output_y( - outputSize(blockExpand->img_size_y(), blockExpand->block_y(), - blockExpand->padding_y(), blockExpand->stride_y(), - /* caffeMode */ false)); + blockExpand->set_output_x(outputSize(blockExpand->img_size_x(), + blockExpand->block_x(), + blockExpand->padding_x(), + blockExpand->stride_x(), + /* caffeMode */ false)); + blockExpand->set_output_y(outputSize(blockExpand->img_size_y(), + blockExpand->block_y(), + blockExpand->padding_y(), + blockExpand->stride_y(), + /* caffeMode */ false)); config.layerConfig.set_size(blockExpand->block_x() * blockExpand->block_y() * blockExpand->channels()); @@ -453,7 +492,11 @@ void testFcLayer(string format, size_t nnz) { << config.inputDefs[0].sparse.format; for (auto useGpu : {false, true}) { - testLayerGrad(config, "fc", 100, /* trans */ false, useGpu, + testLayerGrad(config, + "fc", + 100, + /* trans */ false, + useGpu, /* weight */ true); } } @@ -481,11 +524,19 @@ TEST(Layer, SelectiveFullyConnectedLayer) { {INPUT_SPARSE_NON_VALUE_DATA, "index", nout, 0, ParaSparse("csr", true)}); config.layerConfig.add_inputs(); - testLayerGrad(config, "selective_fc", 100, - /* trans= */ false, /* useGup= */ false, false); + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ false, + false); #ifndef PADDLE_ONLY_CPU - testLayerGrad(config, "selective_fc", 100, - /* trans= */ false, /* useGup= */ true, false); + testLayerGrad(config, + "selective_fc", + 100, + /* trans= */ false, + /* useGup= */ true, + false); #endif } @@ -502,7 +553,10 @@ TEST(Layer, DataNormLayer) { for (auto strategy : {"z-score", "min-max", "decimal-scaling"}) { config.layerConfig.set_data_norm_strategy(strategy); // The parameters are static, so not support GPU now - testLayerGrad(config, "data_norm", 200, /* trans */ false, + testLayerGrad(config, + "data_norm", + 200, + /* trans */ false, /* useGpu */ false); } } @@ -534,8 +588,8 @@ TEST(Layer, multi_cross) { config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - 
testLayerGrad(config, "multi-class-cross-entropy", 100, /* trans */ false, - useGpu); + testLayerGrad( + config, "multi-class-cross-entropy", 100, /* trans */ false, useGpu); } } @@ -550,8 +604,11 @@ TEST(Layer, multi_binary_label_sparse_mat) { config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "multi_binary_label_cross_entropy", 100, - /* trans */ false, useGpu); + testLayerGrad(config, + "multi_binary_label_cross_entropy", + 100, + /* trans */ false, + useGpu); } } @@ -566,8 +623,11 @@ TEST(layer, multi_binary_label_id) { config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "multi_binary_label_cross_entropy", 100, - /* trans */ false, useGpu); + testLayerGrad(config, + "multi_binary_label_cross_entropy", + 100, + /* trans */ false, + useGpu); } } @@ -583,7 +643,9 @@ TEST(Layer, multi_cross_with_selfnorm) { config.layerConfig.add_inputs(); // Not support GPU now - testLayerGrad(config, "multi_class_cross_entropy_with_selfnorm", 100, + testLayerGrad(config, + "multi_class_cross_entropy_with_selfnorm", + 100, /* trans */ false, /* useGpu */ false); } @@ -599,8 +661,11 @@ TEST(Layer, multi_cross_soft) { config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "soft_binary_class_cross_entropy", 100, - /* trans */ false, useGpu); + testLayerGrad(config, + "soft_binary_class_cross_entropy", + 100, + /* trans */ false, + useGpu); } } @@ -630,7 +695,10 @@ TEST(Layer, sparse_square_error) { config.layerConfig.add_inputs(); // "GpuSparseMatrix" as label is not supported - testLayerGrad(config, "square_error", 100, /* trans */ false, + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, /* useGpu */ false); } @@ -645,7 +713,10 @@ TEST(Layer, sparse_float_square_error) { config.layerConfig.add_inputs(); // "GpuSparseMatrix" as label is not supported - testLayerGrad(config, "square_error", 100, /* trans */ false, + testLayerGrad(config, + "square_error", + 100, + /* trans */ false, /* useGpu */ false); } @@ -688,10 +759,14 @@ void testExpandLayer(string trans_type, bool hasSubseq) { config.inputDefs.push_back( {trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA, - "layer_0", 10, 0}); + "layer_0", + 10, + 0}); config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, "layer_1", - 10, 0}); + {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_1", + 10, + 0}); config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); config.layerConfig.set_trans_type(trans_type); @@ -715,8 +790,10 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) { config.biasSize = 0; config.inputDefs.push_back( - {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, "layer_0", - 10, 0}); + {hasSubseq ? 
INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA, + "layer_0", + 10, + 0}); config.layerConfig.add_inputs(); config.layerConfig.set_trans_type(trans_type); @@ -746,9 +823,11 @@ TEST(Layer, MaxLayer) { } TEST(Layer, SequenceLastInstanceLayer) { - testDegradeLayer(false, "seqlastins", + testDegradeLayer(false, + "seqlastins", "non-seq"); // seq seqlastins to non-seq - testDegradeLayer(true, "seqlastins", + testDegradeLayer(true, + "seqlastins", "non-seq"); // hasSubseq seqlastins to non-seq testDegradeLayer(true, "seqlastins", "seq"); // hasSubseq seqlastins to seq } @@ -933,7 +1012,8 @@ TEST(Layer, NormLayer) { } #endif -void setPoolConfig(TestConfig* config, PoolConfig* pool, +void setPoolConfig(TestConfig* config, + PoolConfig* pool, const string& poolType) { (*config).biasSize = 0; (*config).layerConfig.set_type("pool"); @@ -1009,7 +1089,9 @@ TEST(Layer, PoolLayer) { #endif } -void testSppLayer(const string& poolType, const int pyramidHeight, bool trans, +void testSppLayer(const string& poolType, + const int pyramidHeight, + bool trans, bool useGpu) { TestConfig config; config.layerConfig.set_type("spp"); @@ -1232,7 +1314,8 @@ TEST(Layer, NCELayer) { for (auto isIdLabel : {false, true}) { config.inputDefs[1] = { - isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, "label", + isIdLabel ? INPUT_LABEL : INPUT_SPARSE_NON_VALUE_DATA, + "label", /* dim= */ numClasses, /* paraSize= */ 0}; @@ -1254,7 +1337,10 @@ TEST(Layer, NCELayer) { << " isIdLabel=" << isIdLabel << " withWeight=" << withWeight << " withDist=" << withDist; // Not support GPU now - testLayerGrad(config, "nce", 100, /* trans= */ false, + testLayerGrad(config, + "nce", + 100, + /* trans= */ false, /* useGpu */ false); } } @@ -1332,7 +1418,8 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { config.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE); config.layerConfig.set_active_type("sigmoid"); config.biasSize = CHANNELS; - config.inputDefs.push_back({INPUT_DATA, "layer_0", + config.inputDefs.push_back({INPUT_DATA, + "layer_0", /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS, /* paraSize= */ CHANNELS}); @@ -1349,7 +1436,11 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) { img_conf->set_channels(CHANNELS); img_conf->set_img_size(IMG_SIZE); - testLayerGrad(config, "batch_norm", 64, /* trans= */ trans, useGpu, + testLayerGrad(config, + "batch_norm", + 64, + /* trans= */ trans, + useGpu, /* useWeight */ true); } @@ -1384,9 +1475,11 @@ TEST(Operator, conv) { conv->set_groups(1); conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); - int output_x = - outputSize(conv->img_size(), conv->filter_size(), conv->padding(), - conv->stride(), /* caffeMode */ true); + int output_x = outputSize(conv->img_size(), + conv->filter_size(), + conv->padding(), + conv->stride(), + /* caffeMode */ true); conv->set_output_x(output_x); config.layerConfig.set_size(output_x * output_x * config.layerConfig.num_filters()); @@ -1396,8 +1489,10 @@ TEST(Operator, conv) { config.inputDefs.push_back( {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE * CHANNELS, 0}); config.inputDefs.push_back( - {INPUT_DATA, "layer_1", - FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, 0}); + {INPUT_DATA, + "layer_1", + FILTER_SIZE * FILTER_SIZE_Y * CHANNELS * NUM_FILTERS, + 0}); config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); @@ -1411,12 +1506,17 @@ TEST(Layer, FeatureMapExpandLayer) { const int INPUT_SIZE = 100; config.layerConfig.set_size(INPUT_SIZE * CHANNELS); 
config.layerConfig.set_num_filters(CHANNELS); - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", - /* dim= */ INPUT_SIZE, /* paraSize= */ 0}); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, + "layer_0", + /* dim= */ INPUT_SIZE, + /* paraSize= */ 0}); config.layerConfig.add_inputs(); for (auto useGpu : {false, true}) { - testLayerGrad(config, "featmap_expand", - /*batch_size*/ 100, /* trans= */ false, useGpu, + testLayerGrad(config, + "featmap_expand", + /*batch_size*/ 100, + /* trans= */ false, + useGpu, /* useWeight */ true); } } diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp index f45e40c8b6acb5ff7d3e16f9a6f9a5acba13e84e..913d6ed7511a0c3c7c0b40e1fbdb48a17b51b1b2 100644 --- a/paddle/gserver/tests/test_LinearChainCRF.cpp +++ b/paddle/gserver/tests/test_LinearChainCRF.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "paddle/gserver/layers/LinearChainCRF.h" diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp index 73b4d0b8b7110d4ab79809875e2481cd2b565a68..3fc099adbdb6cb562c4bfc419b777ef534bdfed7 100644 --- a/paddle/gserver/tests/test_MultinomialSampler.cpp +++ b/paddle/gserver/tests/test_MultinomialSampler.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include @@ -43,7 +42,7 @@ TEST(MultinomialSampler, gen) { int size = 1024 * 4; default_random_engine reng; - for (size_t iter=0; iter < 256; ++iter) { + for (size_t iter = 0; iter < 256; ++iter) { uniform_int_distribution rand(1, numGrids / size * 1.8); vector prob; int sum = 0; @@ -138,7 +137,6 @@ void benchmarkRandom() { LOG(INFO) << "sum1=" << sum1; } - int main(int argc, char** argv) { initMain(argc, argv); testing::InitGoogleTest(&argc, argv); diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index 8d3eac5aca8d1567690f905b2e4b4f6fab7efdde..1810bc31fc2ce00ed6d8fd588c0dfa9ce398cb45 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -41,7 +41,8 @@ struct DataOut { std::vector paraGrads; }; -void initArgument(DataIn& data, const std::string& configPath, +void initArgument(DataIn& data, + const std::string& configPath, bool useGpu = FLAGS_use_gpu) { TrainerConfigHelper config(configPath); size_t batchSize = config.getOptConfig().batch_size(); @@ -122,9 +123,10 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { } gradientMachine->backward(); for (size_t i = 0; i < in.outGrads.size(); i++) { - MatrixPtr value = - Matrix::create(outArgs[i].value->getHeight(), - outArgs[i].value->getWidth(), false, false); + MatrixPtr value = Matrix::create(outArgs[i].value->getHeight(), + outArgs[i].value->getWidth(), + false, + false); value->copyFrom(*outArgs[i].value); out.outValues.push_back(value); } @@ -147,8 +149,12 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) { gradientMachine->finish(); } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, size_t width = 1) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { int nNum = 
@@ -147,8 +149,12 @@ void calcGradient(DataIn& in, DataOut& out, const std::string& configPath) {
  gradientMachine->finish();
}
-void checkBuffer(real* A, const char* desA, real* B, const char* desB,
-                 size_t len, size_t width = 1) {
+void checkBuffer(real* A,
+                 const char* desA,
+                 real* B,
+                 const char* desB,
+                 size_t len,
+                 size_t width = 1) {
  int nNum = 0;
  for (size_t i = 0; i < len; ++i) {
    real diff = fabs(A[i] - B[i]);
@@ -168,8 +174,10 @@ void compareGradient(DataOut& outA, DataOut& outB) {
            << "------------------------------";
  for (size_t i = 0; i < outA.outValues.size(); ++i) {
    LOG(INFO) << "OUTPUT VALUE: " << i;
-    checkBuffer(outA.outValues[i]->getData(), "network A output",
-                outB.outValues[i]->getData(), "network B output",
+    checkBuffer(outA.outValues[i]->getData(),
+                "network A output",
+                outB.outValues[i]->getData(),
+                "network B output",
                outA.outValues[i]->getElementCnt(),
                outA.outValues[i]->getWidth());
  }
@@ -180,8 +188,10 @@ void compareGradient(DataOut& outA, DataOut& outB) {
            << "------------------------------";
  for (size_t i = 0; i < outA.paraGrads.size(); ++i) {
    LOG(INFO) << "PARAMETER GRADIENT: " << i;
-    checkBuffer(outA.paraGrads[i]->getData(), "Network A",
-                outB.paraGrads[i]->getData(), "Network B",
+    checkBuffer(outA.paraGrads[i]->getData(),
+                "Network A",
+                outB.paraGrads[i]->getData(),
+                "Network B",
                outA.paraGrads[i]->getSize());
  }
}
@@ -247,7 +257,6 @@ TEST(Compare, img_conv) {
}
#endif
-
P_DEFINE_string(config_file_a, "", "config of one network to compare");
P_DEFINE_string(config_file_b, "", "config of another network to compare");
TEST(Compare, network) {
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index 68f7f43261c8353b6836416bea97dad4f817ba75..01070bc1cb3023bc0321f0a8e867b8abd7030e08 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
@@ -41,7 +40,9 @@ const int kSpraseMatrixDim = 1024;
using namespace paddle;  // NOLINT
-void prepareData(DataBatch* batch, const int* numPerSlotType, bool iid,
+void prepareData(DataBatch* batch,
+                 const int* numPerSlotType,
+                 bool iid,
                 bool useGpu) {
  batch->clear();
  int64_t size = uniformRandom(100) + 10;
@@ -137,7 +138,7 @@ inline int getSlotDim(const Argument& arg) {
inline SlotDef::SlotType getSlotType(const Argument& arg) {
  if (arg.value) {
-    auto & m = *arg.value;
+    auto& m = *arg.value;
    auto& type = typeid(m);
    if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
      return SlotDef::VECTOR_DENSE;
@@ -169,8 +170,12 @@ inline SlotDef::SlotType getSlotType(const Argument& arg) {
  return SlotDef::VECTOR_DENSE;
}
-void getColRow(const Argument& arg, int64_t pos, bool useGpu, int* colNum,
-               const int** rowCols, const real** rowValues) {
+void getColRow(const Argument& arg,
+               int64_t pos,
+               bool useGpu,
+               int* colNum,
+               const int** rowCols,
+               const real** rowValues) {
  SlotDef::SlotType type = getSlotType(arg);
  GpuSparseMatrixPtr matGpu;
  CpuSparseMatrixPtr matCpu;
@@ -190,8 +195,11 @@ void getColRow(const Argument& arg, int64_t pos, bool useGpu, int* colNum,
  }
}
-void makeSample(const vector& arguments, int64_t pos,
-                bool isBeginning, DataSample* sample, bool useGpu) {
+void makeSample(const vector& arguments,
+                int64_t pos,
+                bool isBeginning,
+                DataSample* sample,
+                bool useGpu) {
  sample->set_is_beginning(isBeginning);
  int slotid = 0;
  for (auto& arg : arguments) {
@@ -272,8 +280,7 @@ void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
  int64_t totalSeqs = batch.getNumSequences();
  int64_t seq = 0;
-  ICpuGpuVectorPtr sequenceStartPositions =
-      arguments[0].sequenceStartPositions;
+  ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
  int64_t numWritten = 0;
  vector curProtoFiles = dataCompression ? protoFilesCompressed : protoFiles;
@@ -306,8 +313,11 @@ void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
}
// check that the sample at pos1 in args1 is same as the sample at pos2 in args2
-void checkSample(const vector& args1, int64_t pos1,
-                 const vector& args2, int64_t pos2, bool useGpu) {
+void checkSample(const vector& args1,
+                 int64_t pos1,
+                 const vector& args2,
+                 int64_t pos2,
+                 bool useGpu) {
  EXPECT_EQ(args1.size(), args2.size());
  VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
@@ -361,8 +371,11 @@ void checkSample(const vector& args1, int64_t pos1,
  }
}
-void testProtoDataProvider(int* numPerSlotType, bool iid, bool async,
-                           bool useGpu, bool dataCompression,
+void testProtoDataProvider(int* numPerSlotType,
+                           bool iid,
+                           bool async,
+                           bool useGpu,
+                           bool dataCompression,
                           int numConstantSlots = 0) {
  mkDir(kTestDir);
  DataBatch data;
@@ -377,7 +390,9 @@ void testProtoDataProvider(int* numPerSlotType, bool iid, bool async,
  for (int i = 0; i < numConstantSlots; ++i) {
    config.add_constant_slots(i + 11);
-    MatrixPtr w = Matrix::create(data.getSize(), 1, /* trans= */ false,
+    MatrixPtr w = Matrix::create(data.getSize(),
+                                 1,
+                                 /* trans= */ false,
                                 /* useGpu= */ false);
    w->assign(config.constant_slots(i));
    data.appendData(w);
@@ -393,16 +408,14 @@ void testProtoDataProvider(int* numPerSlotType, bool iid, bool async,
  size_t seq1 = 0;
  vector& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 =
-      args1[0].sequenceStartPositions;
+  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
  dataProvider->reset();
  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
    vector& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 =
-        args2[0].sequenceStartPositions;
+    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
    for (auto& arg : args2) {
      EXPECT_EQ(iid, !arg.sequenceStartPositions);
    }
@@ -494,8 +507,8 @@ TEST(ProtoDataProvider, test) {
                numSparseValueVectorSlots;
            numPerSlotType[SlotDef::INDEX] = numIdSlots;
            numPerSlotType[SlotDef::STRING] = numStrSlots;
-            testProtoDataProvider(numPerSlotType, iid, async, useGpu,
-                                  dataCompression);
+            testProtoDataProvider(
+                numPerSlotType, iid, async, useGpu, dataCompression);
          }  // end for (int dataCompression : numTwoArray)
        }  // end for (int useGpu : numTwoArray)
      }  // end for (int async : numTwoArray)
@@ -531,7 +544,9 @@ TEST(ProtoDataProvider, constant_slots) {
      numPerSlotType[SlotDef::INDEX] = 1;
      testProtoDataProvider(numPerSlotType,
                            /* iid= */ true,
-                           /* async= */ false, useGpu, dataCompression,
+                           /* async= */ false,
+                           useGpu,
+                           dataCompression,
                            numConstantSlots);
    }  // end for (int dataCompression : numTwoArray)
  }  // end for (int useGpu : numTwoArray)
@@ -541,16 +556,17 @@ TEST(ProtoDataProvider, constant_slots) {
}
void checkSampleSequence(const vector& args1,
-                         const vector& args2, int64_t offset,
-                         int64_t numSeqs, bool useGpu) {
+                         const vector& args2,
+                         int64_t offset,
+                         int64_t numSeqs,
+                         bool useGpu) {
  // check slot num are equal
  EXPECT_EQ(args1.size(), args2.size());
  for (size_t i = 0; i < args1.size(); i++) {
    auto type = getSlotType(args1[i]);
    // check for args2: sequenceStartPositions vs numSeqs
    // (1) size
-    EXPECT_EQ(args2[i].sequenceStartPositions->getSize(),
-              (size_t)numSeqs + 1);
+    EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
    // (2) content
    auto checkArgContent = [&](const Argument& args, int numSeqs) {
      for (int j = 0; j <= numSeqs; j++) {
@@ -579,8 +595,8 @@ void checkSampleSequence(const vector& args1,
      const real* rowValues1;  // nullptr
      int totalLength = 0;
      for (int j = 0; j < numSeqs; j++) {
-        getColRow(args1[i], offset + j, useGpu, &colNum1, &rowCols1,
-                  &rowValues1);
+        getColRow(
+            args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
        // (1) lengths
        EXPECT_EQ(totalLength, args2[i].sequenceStartPositions->getElement(j));
@@ -626,13 +642,16 @@ void checkSampleSequence(const vector& args1,
  }
}
-void testProtoSequenceDataProvider(int* numPerSlotType, bool async,
+void testProtoSequenceDataProvider(int* numPerSlotType,
+                                   bool async,
                                   bool useGpu) {
  mkDir(kTestDir);
  DataBatch data;
-  prepareData(&data, numPerSlotType,
-              /* iid */ true, useGpu);
+  prepareData(&data,
+              numPerSlotType,
+              /* iid */ true,
+              useGpu);
  writeData(data, useGpu, /* dataCompression */ false);
  DataConfig config;
@@ -649,8 +668,7 @@ void testProtoSequenceDataProvider(int* numPerSlotType, bool async,
  DataBatch batch;
  vector& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 =
-      args1[0].sequenceStartPositions;
+  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
  dataProvider->reset();
@@ -658,8 +676,7 @@ void testProtoSequenceDataProvider(int* numPerSlotType, bool async,
  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
    vector& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 =
-        args2[0].sequenceStartPositions;
+    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
    for (auto& arg : args1) {
      // args1 should not has sequence
      EXPECT_EQ(true, !arg.sequenceStartPositions);
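The sequenceStartPositions checks in these tests rely on one convention: a batch holding numSeqs sequences stores numSeqs + 1 offsets, and sequence j occupies the half-open range [starts[j], starts[j + 1]). A small sketch of that bookkeeping (hypothetical helper, not from the file):

#include <cstddef>
#include <vector>

// Recover per-sequence lengths from numSeqs + 1 start offsets.
std::vector<int> seqLengths(const std::vector<int>& starts) {
  std::vector<int> lens;
  for (size_t j = 0; j + 1 < starts.size(); ++j) {
    lens.push_back(starts[j + 1] - starts[j]);  // length of sequence j
  }
  return lens;
}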
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index 6ad45e3a65a6276ea9fa5bf8b3878c943caf7cba..802f9aa4cb558f48fe55d7d7d5c882d25925bb32 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
@@ -114,9 +113,10 @@ void simpleValueCheck(const vector& argumentList, bool useGpu) {
  // Dense
  real* data;
  if (useGpu) {
-    MatrixPtr cpuMatrixPtr =
-        Matrix::create(argumentList[0].value->getHeight(),
-                       argumentList[0].value->getWidth(), 0, 0);
+    MatrixPtr cpuMatrixPtr = Matrix::create(argumentList[0].value->getHeight(),
+                                            argumentList[0].value->getWidth(),
+                                            0,
+                                            0);
    cpuMatrixPtr->copyFrom(*argumentList[0].value);
    data = cpuMatrixPtr->getData();
  } else {
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index b9867a728d9b4cc8d318578ab3e45021f87daa4c..24aa73910f254e636dfb88182552fe47c12c8543 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -31,14 +31,11 @@ extern void clearOnPoolFilledHook();
}  // namespace unittest
}  // namespace paddle
-
const paddle::real epsilon = 1e-5;
-static inline int64_t readDataBatch(
-    paddle::DataBatch* batch,
-    const std::string& funcName,
-    int64_t batchSize = 65535) {
-
+static inline int64_t readDataBatch(paddle::DataBatch* batch,
+                                    const std::string& funcName,
+                                    int64_t batchSize = 65535) {
  paddle::DataConfig config;
  config.set_type("py2");
  config.set_files(FLAGS_train_list.c_str());
@@ -64,18 +61,19 @@ TEST(PyDataProvider2, dense_no_seq) {
  provider->setSkipShuffle();  // skip shuffle for unittest.
  paddle::DataBatch batch;
-  for (size_t pass=0; pass < 2; ++pass) {  // read 2 passes
+  for (size_t pass = 0; pass < 2; ++pass) {  // read 2 passes
    provider->reset();
    int64_t num = provider->getNextBatchInternal(100, &batch);
    ASSERT_NE(num, 0);
    ASSERT_EQ((size_t)batch.getStreams().size(), (size_t)1);
    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
    // Check batch data.
-    for (size_t i=0; i < 100; ++i) {
-      for (size_t j=0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j-100.0) * (i+1) / 200.0);
-        ASSERT_NEAR(batch.getStreams()[0].value->getData()[i*200 + j],
-                    tmp, epsilon);}
+    for (size_t i = 0; i < 100; ++i) {
+      for (size_t j = 0; j < 200; ++j) {
+        paddle::real tmp = (paddle::real)((j - 100.0) * (i + 1) / 200.0);
+        ASSERT_NEAR(
+            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
+      }
    }
    num = provider->getNextBatchInternal(100, &batch);
@@ -83,12 +81,13 @@ TEST(PyDataProvider2, dense_no_seq) {
    ASSERT_EQ(batch.getStreams().size(), (size_t)1);
    ASSERT_EQ((size_t)batch.getSize(), (size_t)100);
    // Check batch data.
-    for (size_t i=0; i < 100; ++i) {
+    for (size_t i = 0; i < 100; ++i) {
      size_t ii = i + 100;
-      for (size_t j=0; j < 200; ++j) {
-        paddle::real tmp = (paddle::real)((j-100.0) * (ii+1) / 200.0);
-        ASSERT_NEAR(batch.getStreams()[0].value->getData()[i*200 + j],
-                    tmp, epsilon);}
+      for (size_t j = 0; j < 200; ++j) {
+        paddle::real tmp = (paddle::real)((j - 100.0) * (ii + 1) / 200.0);
+        ASSERT_NEAR(
+            batch.getStreams()[0].value->getData()[i * 200 + j], tmp, epsilon);
+      }
    }
    num = provider->getNextBatchInternal(100, &batch);
    ASSERT_EQ(num, 0);
@@ -106,11 +105,11 @@ TEST(PyDataProvider2, index_no_seq) {
  provider->setSkipShuffle();  // skip shuffle for unittest.
  paddle::DataBatch batch;
-  for (size_t pass=0; pass < 2; ++pass) {
+  for (size_t pass = 0; pass < 2; ++pass) {
    provider->reset();
    int64_t num = provider->getNextBatchInternal(10000, &batch);
    CHECK_EQ(num, 200);
-    for (int i=0; i < 200; ++i) {
+    for (int i = 0; i < 200; ++i) {
      CHECK_EQ(i, batch.getStreams()[0].ids->getData()[i]);
    }
  }
@@ -118,13 +117,14 @@ TEST(PyDataProvider2, init_hook) {
  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
-  paddle::PyObjectPtr globals(
-      PyModule_GetDict(PyImport_AddModule("__main__")));
+  paddle::PyObjectPtr globals(PyModule_GetDict(PyImport_AddModule("__main__")));
  PyDict_SetItemString(globals.get(), "pickle", pickle.get());
  paddle::PyObjectPtr locals(PyDict_New());
  paddle::PyObjectPtr mdl(PyRun_String(
      "dumps = pickle.dumps({'value':[float(x) for x in xrange(20)]})",
-      Py_file_input, globals.get(), locals.get()));
+      Py_file_input,
+      globals.get(),
+      locals.get()));
  CHECK_PY(mdl) << "Error!";
  paddle::PyObjectPtr dps(PyDict_GetItemString(locals.get(), "dumps"));
  CHECK_PY(dps) << "Error!";
@@ -145,9 +145,9 @@ TEST(PyDataProvider2, init_hook) {
  ASSERT_EQ(num, 200);
  auto& mat = batch.getStreams()[0].value;
  ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
-  for (size_t i=0; i < 200; ++i) {
-    for (size_t j=0; j < 20; ++j) {
-      ASSERT_NEAR((paddle::real)j, mat->getData()[i*20 + j], epsilon);
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < 20; ++j) {
+      ASSERT_NEAR((paddle::real)j, mat->getData()[i * 20 + j], epsilon);
    }
  }
}
@@ -168,11 +168,11 @@ TEST(PyDataProvider2, sparse_no_value_no_seq) {
  auto csm = std::dynamic_pointer_cast(
      batch.getStreams()[0].value);
  CHECK(csm != nullptr);
-  for (int i=0; i < 200; ++i) {
+  for (int i = 0; i < 200; ++i) {
    CHECK_EQ(csm->getColNum(i), (size_t)10);
    int* cols = csm->getRowCols(i);
-    for (int j=0; j < 10; ++j) {
-      CHECK_EQ(cols[j], (i+1)*(j+1));
+    for (int j = 0; j < 10; ++j) {
+      CHECK_EQ(cols[j], (i + 1) * (j + 1));
    }
  }
}
@@ -183,13 +183,13 @@ TEST(PyDataProvider2, sparse_value_no_seq) {
  auto csm = std::dynamic_pointer_cast(
      batch.getStreams()[0].value);
  CHECK(csm != nullptr);
-  for (int i=0; i < 200; ++i) {
+  for (int i = 0; i < 200; ++i) {
    CHECK_EQ(csm->getColNum(i), (size_t)10);
    int* cols = csm->getRowCols(i);
    real* dat = csm->getRowValues(i);
-    for (int j=0; j < 10; ++j) {
-      EXPECT_EQ(cols[j], (i+1)*(j+1));
-      EXPECT_EQ(dat[j], real(j)/real(i+1));
+    for (int j = 0; j < 10; ++j) {
+      EXPECT_EQ(cols[j], (i + 1) * (j + 1));
+      EXPECT_EQ(dat[j], real(j) / real(i + 1));
    }
  }
}
@@ -198,10 +198,10 @@ TEST(PyDataProvider2, index_seq) {
  paddle::DataBatch batch;
  CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
  auto& arg = batch.getStreams()[0];
-  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 /2);
+  CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
  size_t tmp = 0;
-  for (size_t i=0; i < 200; ++i) {  // CHECK DATA CORRECT
-    for (size_t j=0; j < i+1; ++j) {
+  for (size_t i = 0; i < 200; ++i) {  // CHECK DATA CORRECT
+    for (size_t j = 0; j < i + 1; ++j) {
      ASSERT_EQ((size_t)arg.ids->getData()[tmp], j);
      ++tmp;
    }
@@ -221,9 +221,9 @@ TEST(PyDataProvider2, index_sub_seq) {
  ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
  auto& arg = batch.getStreams()[0];
  size_t tmp = 0;
-  for (size_t i=0; i < 200; ++i) {
-    for (size_t j=0; j < i+1; ++j) {
-      for (size_t k=0; k < j+1; ++k) {
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < i + 1; ++j) {
+      for (size_t k = 0; k < j + 1; ++k) {
        CHECK_EQ((size_t)arg.ids->getData()[tmp++], k);
      }
    }
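All of these PyDataProvider2 tests share one read loop: reset the provider, then pull batches until it returns 0. Condensed into a sketch (reusing the types visible above; the provider and batch size stand in for whatever a caller configures):

// Drain one pass of a DataProvider, mirroring the loops in these tests.
paddle::DataBatch batch;
provider->reset();  // rewind to the start of the pass
while (provider->getNextBatchInternal(100, &batch) > 0) {
  // batch.getStreams() holds one Argument per configured slot;
  // batch.getSize() is the number of samples actually returned.
}
// A return value of 0 marks the end of the pass.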
@@ -236,14 +236,14 @@ TEST(PyDataProvider2, index_sub_seq) {
  ASSERT_EQ(arg.sequenceStartPositions->getData(false)[0], 0);
  size_t idx = 1;
  tmp = 0;
-  for (size_t i=0; i < 200; ++i) {
-    for (size_t j=0; j < i+1; ++j) {
-      tmp += j+1;
+  for (size_t i = 0; i < 200; ++i) {
+    for (size_t j = 0; j < i + 1; ++j) {
+      tmp += j + 1;
      ASSERT_EQ((size_t)arg.subSequenceStartPositions->getData(false)[idx],
-               (size_t)tmp);
+                (size_t)tmp);
      ++idx;
    }
-    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i+1], tmp);
+    ASSERT_EQ((size_t)arg.sequenceStartPositions->getData(false)[i + 1], tmp);
  }
}
@@ -264,7 +264,7 @@ TEST(PyDataProvider2, min_pool_size) {
  paddle::unittest::pydp2::setOnPoolFilledHook([&](size_t poolSize) {
    if (totalData > batchSize) {
-      CHECK_GE(poolSize, std::min(totalData-batchSize, minPoolSize));
+      CHECK_GE(poolSize, std::min(totalData - batchSize, minPoolSize));
    }
  });
  while (true) {
@@ -287,7 +287,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
  config.set_load_data_args("");
  paddle::DataBatch batch;
  std::unique_ptr provider(
-        paddle::DataProvider::create(config, false));
+      paddle::DataProvider::create(config, false));
  provider->reset();
  constexpr size_t batchSize = 100;
  while (true) {
@@ -313,7 +313,7 @@ TEST(PyDataProvider2, input_order) {
  *modelConfig.add_input_layer_names() = "input2";
  paddle::DataBatch batch;
  std::unique_ptr provider(
-        paddle::DataProvider::create(config, modelConfig, false));
+      paddle::DataProvider::create(config, modelConfig, false));
  provider->reset();
  constexpr size_t batchSize = 100;
  while (true) {
@@ -338,7 +338,7 @@ TEST(PyDataProvider2, test_check) {
  config.set_load_data_args("");
  paddle::DataBatch batch;
  std::unique_ptr provider(
-        paddle::DataProvider::create(config, false));
+      paddle::DataProvider::create(config, false));
  provider->reset();
  while (true) {
    size_t realBatchSize = provider->getNextBatchInternal(100, &batch);
@@ -346,7 +346,7 @@ TEST(PyDataProvider2, test_check) {
      break;
    } else {
      auto& ivec = batch.getStream(0).ids;
-      for (size_t i=0; i < ivec->getSize(); ++i) {
+      for (size_t i = 0; i < ivec->getSize(); ++i) {
        CHECK_LT(ivec->getData()[i], 10);
      }
    }
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index d104db3e5b32d5ae5c874f7ef3e5c51fea6366ec..80d713dac03a42b370d50ebb17d089e9be2f17ff 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -45,12 +45,16 @@ public:
    auto p = const_cast(this);
    auto& params = p->getGradientMachine()->getParameters();
    return std::accumulate(
-        params.begin(), params.end(), 0UL,
+        params.begin(),
+        params.end(),
+        0UL,
        [](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
  }
};
-void CalCost(const string& conf, const string& dir, real* cost,
+void CalCost(const string& conf,
+             const string& dir,
+             real* cost,
             int num_passes) {
  auto config = std::make_shared(conf);
  TrainerForTest trainer;
@@ -82,8 +86,8 @@ void CalCost(const string& conf, const string& dir, real* cost,
    int num = dataProvider->getNextBatch(batchSize, &dataBatch);
    if (num == 0) break;
    totalCost += trainer.calcGradient(dataBatch, vecW, vecGradient);
-    sgdUpdate(learningRate, momentum, decayRate, &vecW, &vecGradient,
-              &vecMomentum);
+    sgdUpdate(
+        learningRate, momentum, decayRate, &vecW, &vecGradient, &vecMomentum);
  }
  cost[i] = totalCost;
}
@@ -119,7 +123,8 @@ TEST(RecurrentGradientMachine, HasSubSequence) {
  for (bool useGpu : {false, true}) {
    test("gserver/tests/sequence_layer_group.conf",
"gserver/tests/sequence_nest_layer_group.conf", - 1e-5, useGpu); + 1e-5, + useGpu); } } @@ -127,7 +132,8 @@ TEST(RecurrentGradientMachine, rnn) { for (bool useGpu : {false, true}) { test("gserver/tests/sequence_rnn.conf", "gserver/tests/sequence_nest_rnn.conf", - 1e-6, useGpu); + 1e-6, + useGpu); } } @@ -135,16 +141,18 @@ TEST(RecurrentGradientMachine, rnn_multi_input) { for (bool useGpu : {false, true}) { test("gserver/tests/sequence_rnn_multi_input.conf", "gserver/tests/sequence_nest_rnn_multi_input.conf", - 1e-6, useGpu); + 1e-6, + useGpu); } } TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) { - for (bool useGpu : {false, true}) { - test("gserver/tests/sequence_rnn_multi_unequalength_inputs.conf", - "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf", - 1e-6, useGpu); - } + for (bool useGpu : {false, true}) { + test("gserver/tests/sequence_rnn_multi_unequalength_inputs.conf", + "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.conf", + 1e-6, + useGpu); + } } int main(int argc, char** argv) { diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp index 1c8497e8c526f84cabf6e0862ea96653f99f64be..0643cec38b3a5d96de64438c7342f827fde808a9 100644 --- a/paddle/gserver/tests/test_RecurrentLayer.cpp +++ b/paddle/gserver/tests/test_RecurrentLayer.cpp @@ -71,7 +71,9 @@ void checkError(const CpuVector& vector1, const CpuVector& vector2) { EXPECT_EQ(count, 0) << "There are " << count << " different element."; } -LayerPtr creatDataLayer(string name, size_t batchSize, int layerSize, +LayerPtr creatDataLayer(string name, + size_t batchSize, + int layerSize, bool useGpu) { LayerConfig dataConfig; dataConfig.set_name(name); @@ -96,7 +98,9 @@ LayerPtr creatDataLayer(string name, size_t batchSize, int layerSize, return layer; } -ParameterPtr creatParameter(string name, int pid, size_t paraSize, +ParameterPtr creatParameter(string name, + int pid, + size_t paraSize, bool useGpu) { ParameterConfig paraConfig; paraConfig.set_name(name); @@ -112,7 +116,9 @@ ParameterPtr creatParameter(string name, int pid, size_t paraSize, return parameter; } -ParameterPtr creatParameterBias(string name, int pid, size_t paraSize, +ParameterPtr creatParameterBias(string name, + int pid, + size_t paraSize, bool useGpu) { ParameterConfig paraConfig; paraConfig.set_name(name); @@ -127,8 +133,10 @@ ParameterPtr creatParameterBias(string name, int pid, size_t paraSize, return parameter; } -LayerPtr initRecurrentLayer(LayerConfig layerConfig, size_t batchSize, - int layerSize, bool useGpu) { +LayerPtr initRecurrentLayer(LayerConfig layerConfig, + size_t batchSize, + int layerSize, + bool useGpu) { FLAGS_use_gpu = useGpu; LayerMap layerMap; ParameterMap parameterMap; @@ -214,7 +222,7 @@ TEST(Layer, RecurrentLayer) { #define protected public #include "paddle/gserver/layers/LstmLayer.h" #include "paddle/gserver/layers/GatedRecurrentLayer.h" -template +template class TestRecurrentLayer { public: LayerConfig config_; @@ -227,25 +235,34 @@ public: LayerMap layerMap_; ParameterMap parameterMap_; TestRecurrentLayer(const LayerConfig& config, - bool useGpu, bool useBatch = false) - : config_(config), useGpu_(useGpu), useBatch_(useBatch) {} + bool useGpu, + bool useBatch = false) + : config_(config), useGpu_(useGpu), useBatch_(useBatch) {} void init(size_t batchSize) { FLAGS_use_gpu = useGpu_; testLayer_ = Layer::create(config_); if (typeid(T) == typeid(GatedRecurrentLayer)) { dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(), - 
-                                  batchSize, config_.size() * 3, useGpu_);
+                                  batchSize,
+                                  config_.size() * 3,
+                                  useGpu_);
      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0, config_.size() * config_.size() * 3, useGpu_);
-      bias_ = creatParameterBias(config_.bias_parameter_name(),
-                                 1, config_.size() * 3, useGpu_);
+                             0,
+                             config_.size() * config_.size() * 3,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 3, useGpu_);
    } else if (typeid(T) == typeid(LstmLayer)) {
      dataLayer_ = creatDataLayer(config_.mutable_inputs(0)->input_layer_name(),
-                                  batchSize, config_.size() * 4, useGpu_);
+                                  batchSize,
+                                  config_.size() * 4,
+                                  useGpu_);
      para_ = creatParameter(config_.mutable_inputs(0)->input_parameter_name(),
-                             0, config_.size() * config_.size() * 4, useGpu_);
-      bias_ = creatParameterBias(config_.bias_parameter_name(),
-                                 1, config_.size() * 7, useGpu_);
+                             0,
+                             config_.size() * config_.size() * 4,
+                             useGpu_);
+      bias_ = creatParameterBias(
+          config_.bias_parameter_name(), 1, config_.size() * 7, useGpu_);
    }
    layerMap_[dataLayer_->getName()] = dataLayer_;
    parameterMap_[para_->getName()] = para_;
@@ -266,15 +283,17 @@ public:
  }
};
-template
-void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
-                         bool cpuBatch, bool gpuBatch) {
+template
+void checkRecurrentLayer(LayerConfig layerConfig,
+                         size_t batchSize,
+                         bool cpuBatch,
+                         bool gpuBatch) {
  TestRecurrentLayer testCpu(layerConfig, false, cpuBatch);
  TestRecurrentLayer testGpu(layerConfig, true, gpuBatch);
  testCpu.init(batchSize);
  testGpu.init(batchSize);
-  auto checkError = [](MatrixPtr cpu, MatrixPtr gpu,
-                       int numSequences, const char* str) {
+  auto checkError = [](
+      MatrixPtr cpu, MatrixPtr gpu, int numSequences, const char* str) {
    CpuMatrix check(gpu->getHeight(), gpu->getWidth());
    check.copyFrom(*gpu);
    int height = cpu->getHeight();
@@ -290,8 +309,8 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
        }
      }
    }
-    EXPECT_EQ(count, 0) << "[" << str << "]" <<
-        "There are " << count << " different element.";
+    EXPECT_EQ(count, 0) << "[" << str << "]"
+                        << "There are " << count << " different element.";
  };
  T* cpuLayer = dynamic_cast(testCpu.testLayer_.get());
  T* gpuLayer = dynamic_cast(testGpu.testLayer_.get());
@@ -312,8 +331,8 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
  testCpu.forward();
  testGpu.forward();
-  checkError(cpuLayer->getOutputValue(),
-             gpuLayer->getOutputValue(), 1, "outputValue");
+  checkError(
+      cpuLayer->getOutputValue(), gpuLayer->getOutputValue(), 1, "outputValue");
  /* check backward */
  cpuLayer->getOutputGrad()->randomizeUniform();
@@ -327,11 +346,15 @@ void checkRecurrentLayer(LayerConfig layerConfig, size_t batchSize,
  checkError(cpuInput.grad, gpuInput.grad, 1, "inputGrad");
  // check weight grad
  int numSequences = cpuInput.getNumSequences();
-  checkError(cpuLayer->weight_->getWGrad(), gpuLayer->weight_->getWGrad(),
-             numSequences, "weightGrad");
+  checkError(cpuLayer->weight_->getWGrad(),
+             gpuLayer->weight_->getWGrad(),
+             numSequences,
+             "weightGrad");
  // check bias grad
-  checkError(cpuLayer->bias_->getWGrad(), gpuLayer->bias_->getWGrad(),
-             numSequences, "biasGrad");
+  checkError(cpuLayer->bias_->getWGrad(),
+             gpuLayer->bias_->getWGrad(),
+             numSequences,
+             "biasGrad");
}
TEST(Layer, GatedRecurrentLayer) {
@@ -357,7 +380,7 @@ TEST(Layer, GatedRecurrentLayer) {
        layerConfig.set_size(frameSize);
        layerConfig.set_reversed(reversed);
        checkRecurrentLayer(
-            layerConfig, batchSize, cpuBatch, gpuBatch);
      }
    }
  }
@@ -388,8 +411,8 @@ TEST(Layer, LstmLayer) {
            << " cpuBatch=" << cpuBatch << " gpuBatch=" << gpuBatch;
        layerConfig.set_size(frameSize);
        layerConfig.set_reversed(reversed);
-        checkRecurrentLayer
-            (layerConfig, batchSize, cpuBatch, gpuBatch);
+        checkRecurrentLayer(
+            layerConfig, batchSize, cpuBatch, gpuBatch);
      }
    }
  }
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index 9a83217f1a8471e61c2938eff7185cfa585b6c7d..204b03332ff5bba3b9f3e5d98050942d6f0f390f 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include
#include
@@ -53,7 +52,7 @@ int randint(int* data, size_t int_max, size_t size) {
  int this_int = 0;
  while (count < size) {
-    this_int = std::rand() % int_max; // NOLINT
+    this_int = std::rand() % int_max;  // NOLINT
    if (tmp.find(this_int) == tmp.end()) {
      tmp[this_int] = 0;
      count += 1;
@@ -71,8 +70,10 @@ int randint(int* data, size_t int_max, size_t size) {
  return 0;
}
-void calcOutput(ComData& comData, const string configFile,
-                const string configArgs, bool useGpu) {
+void calcOutput(ComData& comData,
+                const string configFile,
+                const string configArgs,
+                bool useGpu) {
  FLAGS_config = configFile;
  FLAGS_config_args = configArgs;
  FLAGS_use_gpu = useGpu;
@@ -95,8 +96,8 @@ void calcOutput(ComData& comData, const string configFile,
  vector& inArgs = dataBatch.getStreams();
  trainer.getGradientMachine()->start(trainer.getConfig(), nullptr);
-  trainer.getGradientMachine()->forwardBackward(inArgs, &comData.outArgs,
-                                                PASS_TRAIN);
+  trainer.getGradientMachine()->forwardBackward(
+      inArgs, &comData.outArgs, PASS_TRAIN);
  trainer.getGradientMachine()->finish();
}
@@ -108,8 +109,8 @@ void checkMatrix(real* A, real* B, size_t matSize) {
#endif
  int diffNum = 0;
  for (size_t i = 0; i < matSize; ++i) {
-    if (std::isinf(A[i]) || std::isnan(A[i])
-        || std::isinf(B[i]) || std::isnan(B[i])) {
+    if (std::isinf(A[i]) || std::isnan(A[i]) || std::isinf(B[i]) ||
+        std::isnan(B[i])) {
    } else if (fabs(A[i] - B[i]) > err) {
      diffNum++;
    }
@@ -117,8 +118,10 @@ void checkMatrix(real* A, real* B, size_t matSize) {
  EXPECT_EQ(0, diffNum);
}
-void checkTranspose(real* matrix, real* transpose,
-                    size_t width, size_t matSize) {
+void checkTranspose(real* matrix,
+                    real* transpose,
+                    size_t width,
+                    size_t matSize) {
#ifndef PADDLE_TYPE_DOUBLE
  real err = 1e-3;
#else
@@ -149,20 +152,20 @@ void compareOutput(ComData& fcData, ComData& selFcData) {
  // check cost
  LOG(INFO) << "Check cost";
  CpuMatrix fcCost(outArgsFc[0].value->getHeight(),
-                    outArgsFc[0].value->getWidth());
+                   outArgsFc[0].value->getWidth());
  CpuMatrix selfcCost(outArgsSelfc[0].value->getHeight(),
-                       outArgsSelfc[0].value->getWidth());
+                      outArgsSelfc[0].value->getWidth());
  fcCost.copyFrom(*outArgsFc[0].value);
  selfcCost.copyFrom(*outArgsSelfc[0].value);
  checkMatrix(fcCost.getData(), selfcCost.getData(), fcCost.getElementCnt());
  // check selective fc output and fc output
-  LOG(INFO) << "Compare output of SelectiveFullyConectedLayer " <<
-      "with FullyConectedLayer";
+  LOG(INFO) << "Compare output of SelectiveFullyConectedLayer "
+            << "with FullyConectedLayer";
  CpuMatrix fcOut(outArgsFc[1].value->getHeight(),
-                   outArgsFc[1].value->getWidth());
+                  outArgsFc[1].value->getWidth());
  CpuMatrix selfcOut(outArgsSelfc[1].value->getHeight(),
-                      outArgsSelfc[1].value->getWidth());
+                     outArgsSelfc[1].value->getWidth());
  fcOut.copyFrom(*outArgsFc[1].value);
  selfcOut.copyFrom(*outArgsSelfc[1].value);
@@ -189,32 +192,40 @@ void compareOutput(ComData& fcData, ComData& selFcData) {
    CpuVector paraGrad1(*p1->getBuf(PARAMETER_GRADIENT));
    CpuVector paraGrad2(*p2->getBuf(PARAMETER_GRADIENT));
    if (paramName == "rand_fc_param.bias") {
-      checkMatrix(paraValue1.getData(),
-                  paraValue2.getData(),
-                  paraValue1.getSize());
-      checkMatrix(paraGrad1.getData(),
-                  paraGrad2.getData(),
-                  paraGrad1.getSize());
+      checkMatrix(
+          paraValue1.getData(), paraValue2.getData(), paraValue1.getSize());
+      checkMatrix(
+          paraGrad1.getData(), paraGrad2.getData(), paraGrad1.getSize());
    } else {
-      checkTranspose(paraValue1.getData(), paraValue2.getData(),
-                     fcLayerWidth, paraValue1.getSize());
-      checkTranspose(paraGrad1.getData(), paraGrad2.getData(),
-                     fcLayerWidth, paraGrad1.getSize());
+      checkTranspose(paraValue1.getData(),
+                     paraValue2.getData(),
+                     fcLayerWidth,
+                     paraValue1.getSize());
+      checkTranspose(paraGrad1.getData(),
+                     paraGrad2.getData(),
+                     fcLayerWidth,
+                     paraGrad1.getSize());
    }
  }
}
-void compareSparseMulOutput(real* fcOutput, real* selOutput, size_t nnz,
-    const std::shared_ptr > > &selCols) {
+void compareSparseMulOutput(
+    real* fcOutput,
+    real* selOutput,
+    size_t nnz,
+    const std::shared_ptr>>& selCols) {
#ifndef PADDLE_TYPE_DOUBLE
  real err = 1e-3;
#else
  real err = 1e-10;
#endif
-  size_t nnzCount = std::accumulate(selCols->begin(), selCols->end(), 0UL,
-      [](size_t a, const std::pair& arr){
-        return a+arr.second;
-      });
+  size_t nnzCount =
+      std::accumulate(selCols->begin(),
+                      selCols->end(),
+                      0UL,
+                      [](size_t a, const std::pair& arr) {
+                        return a + arr.second;
+                      });
  EXPECT_EQ(nnz, nnzCount);
  size_t sampleNum = selCols->size();
@@ -225,18 +236,20 @@ void compareSparseMulOutput(real* fcOutput, real* selOutput, size_t nnz,
      size_t selIdx = (*selCols)[i].first[j];
      if (fabs(fcOutput[i * fcLayerWidth + selIdx] - selOutput[count]) > err) {
        diffNum++;
-        LOG(INFO) << count << " diff : "
-                  << fcOutput[i * fcLayerWidth + selIdx] << "\t"
-                  << selOutput[count];
-      }
+        LOG(INFO) << count << " diff : " << fcOutput[i * fcLayerWidth + selIdx]
+                  << "\t" << selOutput[count];
+      }
      count++;
    }
  }
  EXPECT_EQ(0, diffNum);
}
-LayerPtr creatDataLayer(string name, size_t batchSize, size_t layerSize,
-                        std::vector& values, bool useGpu) {
+LayerPtr creatDataLayer(string name,
+                        size_t batchSize,
+                        size_t layerSize,
+                        std::vector& values,
+                        bool useGpu) {
  LayerConfig dataConfig;
  dataConfig.set_name(name);
  dataConfig.set_type("data");
@@ -253,8 +266,8 @@ LayerPtr creatDataLayer(string name, size_t batchSize, size_t layerSize,
  return layer;
}
-ParameterPtr creatParameter(string name, int pid, size_t paraSize,
-                            string paramFile, bool useGpu) {
+ParameterPtr creatParameter(
+    string name, int pid, size_t paraSize, string paramFile, bool useGpu) {
  ParameterConfig paraConfig;
  paraConfig.set_name(name);
  paraConfig.set_size(paraSize);
@@ -268,16 +281,19 @@ ParameterPtr creatParameter(string name, int pid, size_t paraSize,
  return parameter;
}
-LayerPtr initFcLayer(LayerPtr dataLayer, LayerConfig layerConfig,
-                     int dataLayerSize, int fcLayerSize,
-                     string paraName, string paraFile, bool useGpu) {
+LayerPtr initFcLayer(LayerPtr dataLayer,
+                     LayerConfig layerConfig,
+                     int dataLayerSize,
+                     int fcLayerSize,
+                     string paraName,
+                     string paraFile,
+                     bool useGpu) {
  LayerMap layerMap;
  ParameterMap parameterMap;
  layerMap[dataLayer->getName()] = dataLayer;
-  ParameterPtr para =
-      creatParameter(paraName, 0, dataLayerSize * fcLayerSize,
-                     paraFile, useGpu);
+  ParameterPtr para = creatParameter(
+      paraName, 0, dataLayerSize * fcLayerSize, paraFile, useGpu);
  parameterMap[para->getName()] = para;
  layerConfig.add_inputs();
@@ -296,14 +312,13 @@ LayerPtr initFcLayer(LayerPtr dataLayer, LayerConfig layerConfig,
#ifndef PADDLE_TYPE_DOUBLE
// The parameter file used in fc.conf and selective_fc.conf is float
TEST(Layer, SelectiveFcLayer_train_dense_mul) {
-  const string& fcConfig =
-      "gserver/tests/SelectiveFcTest/conf/fc.conf";
+  const string& fcConfig = "gserver/tests/SelectiveFcTest/conf/fc.conf";
  const string& fcConfigArgs =
-        "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
+      "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
  const string& selFcConfig =
      "gserver/tests/SelectiveFcTest/conf/selective_fc.conf";
  const string& selConfigArgs =
-        "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
+      "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
  for (auto useGpu : {false, true}) {
#ifdef PADDLE_ONLY_CPU
@@ -323,7 +338,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
}
#endif  // PADDLE_TYPE_DOUBLE
-void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config,
+void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
                                        bool useGpu) {
  FLAGS_use_gpu = useGpu;
  size_t batchSize = 100;
@@ -332,21 +347,26 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config,
  for (size_t j = 0; j < batchSize * dataLayerSize; ++j) {
    values[j] = std::rand() / real(RAND_MAX);
  }
-  LayerPtr dataLayer = creatDataLayer(
-      "data", batchSize, dataLayerSize, values, useGpu);
+  LayerPtr dataLayer =
+      creatDataLayer("data", batchSize, dataLayerSize, values, useGpu);
  const string& selfcParaFile =
-        "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose";
+      "gserver/tests/SelectiveFcTest/model/rand_fc_param.w.transpose";
  const string& selfcParaName = "rand_fc_param.w.transpose";
  std::shared_ptr selfcLayer =
-      std::dynamic_pointer_cast(initFcLayer(
-          dataLayer, config, dataLayerSize, fcLayerWidth,
-          selfcParaName, selfcParaFile, useGpu));
+      std::dynamic_pointer_cast(
+          initFcLayer(dataLayer,
+                      config,
+                      dataLayerSize,
+                      fcLayerWidth,
+                      selfcParaName,
+                      selfcParaFile,
+                      useGpu));
  // create selected columns
-  std::shared_ptr > > selCols(
-      new std::vector > (batchSize));
+  std::shared_ptr>> selCols(
+      new std::vector>(batchSize));
  size_t maxNNZ = 30;
  srand((size_t)(time(NULL)));
  int total = 0;
@@ -364,8 +384,9 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config,
  MatrixPtr outMatSelfc = selfcLayer->getOutputValue();
  CpuSparseMatrixPtr cpuOutMatSelfc(
-      new CpuSparseMatrix(outMatSelfc->getHeight(), outMatSelfc->getWidth(),
-                          outMatSelfc->getElementCnt()));
+      new CpuSparseMatrix(outMatSelfc->getHeight(),
+                          outMatSelfc->getWidth(),
+                          outMatSelfc->getElementCnt()));
  cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
#ifndef PADDLE_ONLY_CPU
  if (useGpu) {
@@ -376,7 +397,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config,
  size_t nnz = cpuOutMatSelfc->getElementCnt();
  const string& fcParaFile =
-      "gserver/tests/SelectiveFcTest/model/rand_fc_param.w";
+      "gserver/tests/SelectiveFcTest/model/rand_fc_param.w";
  const string& fcParaName = "rand_fc_param.w";
  LayerConfig fcLayerConfig;
  fcLayerConfig.set_name("fc_layer");
@@ -384,13 +405,18 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config,
  fcLayerConfig.set_active_type("linear");
  fcLayerConfig.set_size(fcLayerWidth);
-  LayerPtr fcLayer = initFcLayer(dataLayer, fcLayerConfig,
-                                 dataLayerSize, fcLayerWidth, fcParaName, fcParaFile, useGpu);
+  LayerPtr fcLayer = initFcLayer(dataLayer,
+                                 fcLayerConfig,
+                                 dataLayerSize,
+                                 fcLayerWidth,
+                                 fcParaName,
+                                 fcParaFile,
+                                 useGpu);
  fcLayer->forward(PASS_TEST);
  MatrixPtr outMatFc = fcLayer->getOutputValue();
  MatrixPtr cpuOutMatFc(
-        new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
+      new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
  cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
#ifndef PADDLE_ONLY_CPU
  if (useGpu) {
@@ -401,7 +427,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig &config,
  compareSparseMulOutput(outValueFc, outValueSelfc, nnz, selCols);
  for (size_t i = 0; i < batchSize; ++i) {
-    delete [](*selCols)[i].first;
+    delete[](*selCols)[i].first;
  }
}
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index f7aa60380f23eeea91ee852480862f6b19caedec..cba8b37289b53b7d75c64a6a95c9e3900b193902 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
@@ -48,10 +47,10 @@ public:
   * @return Pointer to the allocated memory
   */
  virtual void* alloc(size_t size) {
-      void* ptr;
-      CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
-      CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
-      return ptr;
+    void* ptr;
+    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
+    return ptr;
  }
  /**
@@ -59,12 +58,12 @@ public:
   * @param ptr Pointer to be free.
   */
  virtual void free(void* ptr) {
-    if (ptr) { ::free(ptr); }
+    if (ptr) {
+      ::free(ptr);
+    }
  }
-  virtual std::string getName() {
-    return "cpu_alloc";
-  }
+  virtual std::string getName() { return "cpu_alloc"; }
};
/**
@@ -81,7 +80,7 @@ public:
   */
  virtual void* alloc(size_t size) {
    void* ptr = hl_malloc_device(size);
-    CHECK(ptr)<< "Fail to allocate GPU memory " << size << " bytes";
+    CHECK(ptr) << "Fail to allocate GPU memory " << size << " bytes";
    return ptr;
  }
@@ -95,9 +94,7 @@ public:
    }
  }
-  virtual std::string getName() {
-    return "gpu_alloc";
-  }
+  virtual std::string getName() { return "gpu_alloc"; }
};
/**
@@ -128,9 +125,7 @@ public:
    }
  }
-  virtual std::string getName() {
-    return "cuda_host_alloc";
-  }
+  virtual std::string getName() { return "cuda_host_alloc"; }
};
}  // namespace paddle
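CpuAllocator::alloc above asks posix_memalign for 32-byte alignment (what SIMD kernels expect) and frees with plain ::free. The same pair in isolation, assuming a POSIX platform:

#include <stdlib.h>

// Allocate size bytes with 32-byte alignment; nullptr on failure.
void* alignedAlloc(size_t size) {
  void* ptr = nullptr;
  if (posix_memalign(&ptr, 32ul, size) != 0) return nullptr;
  return ptr;
}

// Memory from posix_memalign is released with ordinary free().
void alignedFree(void* ptr) {
  if (ptr) ::free(ptr);
}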
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 3a91fdc3c30c5332866a97c256b018eb0982260f..d41dcee682cce15e94d45dafeb12bb0dce19b221 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
#include
@@ -52,9 +51,14 @@ public:
  size_t cRow_;
  size_t dCol_;
  size_t dRow_;
-  MatrixOffset(size_t aCol = 0, size_t aRow = 0, size_t bCol = 0,
-               size_t bRow = 0, size_t cCol = 0, size_t cRow = 0,
-               size_t dCol = 0, size_t dRow = 0)
+  MatrixOffset(size_t aCol = 0,
+               size_t aRow = 0,
+               size_t bCol = 0,
+               size_t bRow = 0,
+               size_t cCol = 0,
+               size_t cRow = 0,
+               size_t dCol = 0,
+               size_t dRow = 0)
      : aCol_(aCol),
        aRow_(aRow),
        bCol_(bCol),
@@ -65,7 +69,7 @@ public:
        dRow_(dRow) {}
};
-template
+template
class BaseMatrixT {
public:
  size_t height_, width_;
@@ -97,8 +101,12 @@ public:
        trans_(mat.trans_),
        useGpu_(useGpu) {}
-  BaseMatrixT(size_t height, size_t width, size_t stride, T* data, bool trans,
-              bool use_gpu)
+  BaseMatrixT(size_t height,
+              size_t width,
+              size_t stride,
+              T* data,
+              bool trans,
+              bool use_gpu)
      : height_(height),
        width_(width),
        stride_(stride),
@@ -167,12 +175,17 @@ public:
   * @endcode
   */
  template
-  int applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
-                  MatrixOffset& offset, bAsRowVector, bAsColVector);
+  int applyBinary(Op op,
+                  BaseMatrixT& b,
+                  int numRows,
+                  int numCols,
+                  MatrixOffset& offset,
+                  bAsRowVector,
+                  bAsColVector);
  template
-  int applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
-                  MatrixOffset& offset);
+  int applyBinary(
+      Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset);
  /**
   * ternary operator: element wise op(a, b, c).
@@ -212,13 +225,22 @@ public:
   * @endcode
   */
  template
-  int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, int numRows,
-                   int numCols, MatrixOffset& offset, cAsRowVector,
+  int applyTernary(Op op,
+                   BaseMatrixT& b,
+                   BaseMatrixT& c,
+                   int numRows,
+                   int numCols,
+                   MatrixOffset& offset,
+                   cAsRowVector,
                   cAsColVector);
  template
-  int applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c, int numRows,
-                   int numCols, MatrixOffset& offset);
+  int applyTernary(Op op,
+                   BaseMatrixT& b,
+                   BaseMatrixT& c,
+                   int numRows,
+                   int numCols,
+                   MatrixOffset& offset);
  /**
   * quaternary operator: element wise op(a, b, c, d).
@@ -247,8 +269,13 @@ public:
   * @endcode
   */
  template
-  int applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d,
-                      int numRows, int numCols, MatrixOffset& offset);
+  int applyQuaternary(Op op,
+                      BaseMatrixT& b,
+                      BaseMatrixT& c,
+                      BaseMatrixT& d,
+                      int numRows,
+                      int numCols,
+                      MatrixOffset& offset);
  /**
   * a aggregate expression that apply each row(or column) of matrix b.
@@ -266,10 +293,20 @@ public:
   * a[i] = sv(a[i], dst)
   * @endcode
   */
-  template
-  int aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, int numRows,
-                int numCols, MatrixOffset& offset, aAsRowVector, aAsColVector);
+  int aggregate(Agg agg,
+                Op op,
+                Saver sv,
+                BaseMatrixT& b,
+                int numRows,
+                int numCols,
+                MatrixOffset& offset,
+                aAsRowVector,
+                aAsColVector);
  /**
   * a aggregate expression that apply each row(or column) of matrix b and c.
@@ -288,10 +325,20 @@ public:
   * a[i] = sv(a[i], dst)
   * @endcode
   */
-  template
-  int aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c,
-                int numRows, int numCols, MatrixOffset& offset, aAsRowVector,
+  int aggregate(Agg agg,
+                Op op,
+                Saver sv,
+                BaseMatrixT& b,
+                BaseMatrixT& c,
+                int numRows,
+                int numCols,
+                MatrixOffset& offset,
+                aAsRowVector,
                aAsColVector);
  /**
@@ -319,8 +366,12 @@ public:
  // Same as the above with the special handing of sv=add2(scaleDest, scaleAgg)
  template
-  int applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
-               BaseMatrixT& b, BaseMatrixT& c);
+  int applyRow(Agg agg,
+               Op op,
+               real scaleDest,
+               real scaleAgg,
+               BaseMatrixT& b,
+               BaseMatrixT& c);
  /**
   * a aggregate expression that apply each row of matrix b.
@@ -664,8 +715,7 @@ public:
   * this = a*p1 + b*p2 + c*p3
   * @endcode
   */
-  void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2,
-            T p3);
+  void add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3);
  /**
   * @code
@@ -675,9 +725,9 @@ public:
   */
  void sgdUpdate(BaseMatrixT& b,  // grad
                 BaseMatrixT& c,  // mom
-                 T p1,        // learningRate,
-                 T p2,        // momentum,
-                 T p3);       // decayRate
+                 T p1,  // learningRate,
+                 T p2,  // momentum,
+                 T p3);  // decayRate
  /**
   * @code
@@ -688,9 +738,9 @@ public:
  void sgdUpdate(BaseMatrixT& b,  // grad,
                 BaseMatrixT& c,  // mom,
                 BaseMatrixT& d,  // lr,
-                 T p1,        // learningRate,
-                 T p2,        // momentum,
-                 T p3);       // decayRate
+                 T p1,  // learningRate,
+                 T p2,  // momentum,
+                 T p3);  // decayRate
  /// apply L1/L2 to *this*
  void applyL1(T learningRate, T decayRate);
@@ -767,17 +817,21 @@ public:
   * this = b>c ? b : c
   * @endcode
   */
-  void max(BaseMatrixT& b, BaseMatrixT& c);  // NOLINT
+  void max(BaseMatrixT& b, BaseMatrixT& c);  // NOLINT
  /**
   * @code
   * this[destCol] += (b>p1 == c>p1) ? 0 : 1)
   * @endcode
   */
-  void binaryClassificationError(size_t destCol, BaseMatrixT& b, BaseMatrixT& c,
+  void binaryClassificationError(size_t destCol,
+                                 BaseMatrixT& b,
+                                 BaseMatrixT& c,
                                 T p);
-  void binaryClassificationError2(size_t destCol, BaseMatrixT& b,
-                                  BaseMatrixT& c, T p);
+  void binaryClassificationError2(size_t destCol,
+                                  BaseMatrixT& b,
+                                  BaseMatrixT& c,
+                                  T p);
  /**
   * @code
@@ -833,8 +887,8 @@ public:
   * this += sqr(p1*b + p2*c + p3*d)
   * @endcode
   */
-  void addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1,
-                    T p2, T p3);
+  void addSquareSum(
+      BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3);
  /**
   * @code
@@ -965,12 +1019,13 @@ public:
  void sumCols(BaseMatrixT& b, T scaleSum, T scaleDest);
  /// this_i = scaleDest * this_i + scaleSum * \sum_j (b_{ij} - c_{ij})^2
-  void sumOfSquaredDiffs(BaseMatrixT& b, BaseMatrixT& c,
-                         T scaleSum, T scaleDest);
+  void sumOfSquaredDiffs(BaseMatrixT& b,
+                         BaseMatrixT& c,
+                         T scaleSum,
+                         T scaleDest);
  /// this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
-  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c,
-                     T scaleSum, T scaleDest);
+  void sumOfProducts(BaseMatrixT& b, BaseMatrixT& c, T scaleSum, T scaleDest);
  /**
   * @code
@@ -985,9 +1040,7 @@ public:
   */
  void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
-  virtual bool isSparse() const {
-    return false;
-  }
+  virtual bool isSparse() const { return false; }
};
typedef BaseMatrixT BaseMatrix;
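The sgdUpdate declarations above take (grad, mom, learningRate, momentum, decayRate). Spelled out elementwise under the classic momentum-plus-weight-decay form — a sketch of what that signature suggests, not a transcription of the actual kernel:

// One SGD step with momentum and L2 decay over n weights in a (assumed form).
void sgdUpdateSketch(float* a, const float* grad, float* mom, int n,
                     float lr, float momentum, float decayRate) {
  for (int i = 0; i < n; ++i) {
    mom[i] = momentum * mom[i] - lr * (grad[i] + decayRate * a[i]);
    a[i] += mom[i];
  }
}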
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index 64ee124a5613a99ac3d7ff36897e4f2d0489ad51..ad3f8e64efd37c27c7f462dd7c8311577a05a391 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "hl_gpu.h"
#include "CpuSparseMatrix.h"
#include "SparseMatrix.h"
@@ -24,24 +23,35 @@ namespace paddle {
const size_t CpuSparseMatrix::DEFAULT_AVG_WIDTH;
-CpuSparseMatrix::CpuSparseMatrix(size_t height, size_t width, size_t nnz,
-                                 SparseValueType valueType, SparseFormat format,
+CpuSparseMatrix::CpuSparseMatrix(size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
                                 bool trans)
    : Matrix(NULL, height, width, trans, false) {
  resize(height, width, nnz, valueType, format);
}
-CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle, size_t height,
-                                 size_t width, size_t nnz,
-                                 SparseValueType valueType, SparseFormat format,
+CpuSparseMatrix::CpuSparseMatrix(CpuMemHandlePtr dataHandle,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
                                 bool trans)
    : Matrix(dataHandle, height, width, trans, false) {
  resize(height, width, nnz, valueType, format);
}
-CpuSparseMatrix::CpuSparseMatrix(real* data, int* rows, int* cols,
-                                 size_t height, size_t width, size_t nnz,
-                                 SparseValueType valueType, SparseFormat format,
+CpuSparseMatrix::CpuSparseMatrix(real* data,
+                                 int* rows,
+                                 int* cols,
+                                 size_t height,
+                                 size_t width,
+                                 size_t nnz,
+                                 SparseValueType valueType,
+                                 SparseFormat format,
                                 bool trans)
    : Matrix(NULL, height, width, trans, false) {
  cols_ = cols;
@@ -54,8 +64,11 @@ CpuSparseMatrix::CpuSparseMatrix(real* data, int* rows, int* cols,
  format_ = format;
}
-void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth, size_t newNnz,
-                             SparseValueType valueType, SparseFormat format) {
+void CpuSparseMatrix::resize(size_t newHeight,
+                             size_t newWidth,
+                             size_t newNnz,
+                             SparseValueType valueType,
+                             SparseFormat format) {
  CHECK_LE(newNnz, newHeight * newWidth);
  size_t newSize = 0;
  if (format == SPARSE_CSR) {
@@ -110,23 +123,38 @@ void CpuSparseMatrix::sparseResize() {
}
void CpuSparseMatrix::resize(size_t newHeight, size_t newWidth) {
-  resize(newHeight, newWidth, newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth),
-         valueType_, format_);
+  resize(newHeight,
+         newWidth,
+         newHeight * std::min(DEFAULT_AVG_WIDTH, newWidth),
+         valueType_,
+         format_);
}
MatrixPtr CpuSparseMatrix::getTranspose() {
  if (!memoryHandle_ && !value_) {
-    MatrixPtr dest(new CpuSparseMatrix(height_, width_, elementCnt_, valueType_,
-                                       format_, true));
+    MatrixPtr dest(new CpuSparseMatrix(
+        height_, width_, elementCnt_, valueType_, format_, true));
    return dest;
  } else if (memoryHandle_) {
    MatrixPtr dest(new CpuSparseMatrix(
-        std::dynamic_pointer_cast(memoryHandle_), height_,
-        width_, elementCnt_, valueType_, format_, true));
+        std::dynamic_pointer_cast(memoryHandle_),
+        height_,
+        width_,
+        elementCnt_,
+        valueType_,
+        format_,
+        true));
    return dest;
  } else if (value_) {
-    MatrixPtr dest(new CpuSparseMatrix(value_, rows_, cols_, height_, width_,
-                                       elementCnt_, valueType_, format_, true));
+    MatrixPtr dest(new CpuSparseMatrix(value_,
+                                       rows_,
+                                       cols_,
+                                       height_,
+                                       width_,
+                                       elementCnt_,
+                                       valueType_,
+                                       format_,
+                                       true));
    return dest;
  } else {
    return NULL;
@@ -140,7 +168,10 @@ void CpuSparseMatrix::mul(MatrixPtr a, MatrixPtr b, real scaleAB, real scaleT) {
  if (dynamic_cast(a.get()) &&
      dynamic_cast(b.get())) {
    CpuMatrix::mul(dynamic_cast(a.get()),
-                   dynamic_cast(b.get()), this, scaleAB, scaleT);
+                   dynamic_cast(b.get()),
+                   this,
+                   scaleAB,
+                   scaleT);
  } else {
    LOG(FATAL) << "not supported";
  }
@@ -243,7 +274,8 @@ void CpuSparseMatrix::randomizeUniform() {
  }
}
-void CpuSparseMatrix::copyFrom(std::vector& rows, std::vector& cols,
+void CpuSparseMatrix::copyFrom(std::vector& rows,
+                               std::vector& cols,
                               std::vector& values) {
  size_t size = format_ == SPARSE_CSR ? cols.size() : rows.size();
  resize(height_, width_, size, valueType_, format_);
@@ -302,11 +334,11 @@ MatrixPtr CpuSparseMatrix::clone(size_t height, size_t width, bool useGpu) {
  }
  CHECK(width && height);
  if (!useGpu) {
-    return std::make_shared(height, width, 0, valueType_,
-                            format_);
+    return std::make_shared(
+        height, width, 0, valueType_, format_);
  } else {
-    return std::make_shared(height, width, elementCnt_,
-                            valueType_, format_);
+    return std::make_shared(
+        height, width, elementCnt_, valueType_, format_);
  }
}
@@ -315,13 +347,25 @@ MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
  CHECK_EQ(format_, SPARSE_CSR);
  if (valueType_ == NO_VALUE) {
    return std::make_shared(
-        nullptr, rows_ + startRow, cols_, numRows, width_,
-        rows_[startRow + numRows] - rows_[startRow], valueType_, format_,
+        nullptr,
+        rows_ + startRow,
+        cols_,
+        numRows,
+        width_,
+        rows_[startRow + numRows] - rows_[startRow],
+        valueType_,
+        format_,
        trans_);
  } else {
    return std::make_shared(
-        value_, rows_ + startRow, cols_, numRows, width_,
-        rows_[startRow + numRows] - rows_[startRow], valueType_, format_,
+        value_,
+        rows_ + startRow,
+        cols_,
+        numRows,
+        width_,
+        rows_[startRow + numRows] - rows_[startRow],
+        valueType_,
+        format_,
        trans_);
  }
}
@@ -404,8 +448,10 @@ void CpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
  }
}
-void CpuSparseMatrix::setRow(size_t row, size_t colNum,
-                             const unsigned int* cols, const real* values) {
+void CpuSparseMatrix::setRow(size_t row,
+                             size_t colNum,
+                             const unsigned int* cols,
+                             const real* values) {
  if (format_ == SPARSE_CSR) {
    CHECK_LT(row, height_);
    CHECK(NULL != cols);
@@ -494,11 +540,23 @@ void CpuSparseMatrix::copyFrom(const GpuSparseMatrix& src, hl_stream_t stream) {
  CHECK_EQ(size_t(elementCnt_), src.getElementCnt());
  size_t valSize = valueType_ == NO_VALUE ? 0 : elementCnt_;
  if (format_ == SPARSE_CSC)
-    hl_memcpy_from_csc_matrix(value_, valSize, rows_, elementCnt_, cols_,
-                              width_ + 1, src.sMatrix_.get(), stream);
+    hl_memcpy_from_csc_matrix(value_,
+                              valSize,
+                              rows_,
+                              elementCnt_,
+                              cols_,
+                              width_ + 1,
+                              src.sMatrix_.get(),
+                              stream);
  else
-    hl_memcpy_from_csr_matrix(value_, valSize, rows_, height_ + 1, cols_,
-                              elementCnt_, src.sMatrix_.get(), stream);
+    hl_memcpy_from_csr_matrix(value_,
+                              valSize,
+                              rows_,
+                              height_ + 1,
+                              cols_,
+                              elementCnt_,
+                              src.sMatrix_.get(),
+                              stream);
}
void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) {
@@ -536,14 +594,16 @@ void CpuSparseMatrix::copyFrom(const CpuSparseMatrix& src) {
  }
}
-void CpuSparseMatrix::copyRow(int offsets, size_t colNum,
+void CpuSparseMatrix::copyRow(int offsets,
+                              size_t colNum,
                              const sparse_non_value_t* row) {
  for (size_t j = 0; j < colNum; j++) {
    cols_[offsets + j] = row[j].col;
  }
}
-void CpuSparseMatrix::copyRow(int offsets, size_t colNum,
+void CpuSparseMatrix::copyRow(int offsets,
+                              size_t colNum,
                              const sparse_float_value_t* row) {
  for (size_t j = 0; j < colNum; j++) {
    cols_[offsets + j] = row[j].col;
@@ -596,7 +656,8 @@ void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
  if (format_ == SPARSE_CSR) {
    int* srcCols = src.getCols();
    size_t numLessWidth =
-        std::count_if(srcCols, srcCols + src.getElementCnt(),
+        std::count_if(srcCols,
+                      srcCols + src.getElementCnt(),
                      [this](size_t n) { return n < this->width_; });
    resize(height_, width_, numLessWidth, valueType_, format_);
    rows_[0] = 0;
@@ -636,13 +697,15 @@ void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
void CpuSparseMatrix::zeroMem() {
  CHECK(valueType_ == FLOAT_VALUE);
-  memset(value_, 0, elementCnt_* sizeof(real));
+  memset(value_, 0, elementCnt_ * sizeof(real));
}
-template void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices,
+template void CpuSparseMatrix::copyFrom(int64_t* ids,
+                                        int64_t* indices,
                                        sparse_non_value_t* data);
-template void CpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices,
+template void CpuSparseMatrix::copyFrom(int64_t* ids,
+                                        int64_t* indices,
                                        sparse_float_value_t* data);
template void CpuSparseMatrix::copyFrom(int64_t* indices,
@@ -673,7 +736,9 @@ void CpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
  }
  size_t outsize = std::min(num, beam);
-  std::partial_sort(vec.begin(), vec.begin() + outsize, vec.end(),
+  std::partial_sort(vec.begin(),
+                    vec.begin() + outsize,
+                    vec.end(),
                    [](const valuepair& a, const valuepair& b) {
                      return a.first > b.first;
                    });
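subMatrix and the copyFrom overloads above all lean on the CSR layout: rows_ holds height_ + 1 offsets into cols_/value_, so row r's nonzeros sit in [rows_[r], rows_[r + 1]). Walking one row in isolation (hypothetical helper, assuming float values):

#include <cstdio>

// Print the nonzeros of row r of a CSR matrix given its three arrays.
void printCsrRow(const int* rows, const int* cols, const float* vals, int r) {
  for (int k = rows[r]; k < rows[r + 1]; ++k) {
    std::printf("(%d, %d) = %f\n", r, cols[k], vals[k]);
  }
}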
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index fd3b5030bea7ac937d9cf828e29d3441446a65f6..861564555166da0bb70d500569dc0d4f89dd2fe5 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include
#include "Matrix.h"
@@ -21,24 +20,38 @@ namespace paddle {
class CpuSparseMatrix : public Matrix {
public:
-  CpuSparseMatrix(size_t height, size_t width,
+  CpuSparseMatrix(size_t height,
+                  size_t width,
                  size_t nnz, /* used to allocate space */
                  SparseValueType valueType = FLOAT_VALUE,
-                  SparseFormat format = SPARSE_CSR, bool trans = false);
-
-  CpuSparseMatrix(CpuMemHandlePtr memHandle, size_t height, size_t width,
-                  size_t nnz, SparseValueType valueType, SparseFormat format,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false);
+
+  CpuSparseMatrix(CpuMemHandlePtr memHandle,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
                  bool trans);
-  CpuSparseMatrix(real* data, int* rows, int* cols, size_t height, size_t width,
-                  size_t nnz, SparseValueType valueType, SparseFormat format,
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
                  bool trans);
  ~CpuSparseMatrix() {}
-  void resize(size_t newHeight, size_t newWidth,
+  void resize(size_t newHeight,
+              size_t newWidth,
              size_t newNnz, /* used to allocate space */
-              SparseValueType valueType, SparseFormat format);
+              SparseValueType valueType,
+              SparseFormat format);
  void resize(size_t newHeight, size_t newWidth);
  MatrixPtr getTranspose();
@@ -75,8 +88,6 @@ public:
    }
  }
-
-
  real* getColumn(size_t i) const {
    if (format_ == SPARSE_CSC) {
      return value_ + cols_[i];
@@ -182,7 +193,7 @@ public:
   * getData is convenient to get value
   */
  real* getData() { return getValue(); }
-  const real* getData() const { return getValue();}
+  const real* getData() const { return getValue(); }
  /**
   * @brief only set value_ of FLOAT_VALUE sparse matrix to zero
@@ -220,7 +231,9 @@ public:
  void printOneRow(std::ostream& os, size_t idx) const;
-  void setRow(size_t row, size_t colNum, const unsigned int* cols,
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
              const real* values);
  void randomizeUniform();
@@ -241,7 +254,8 @@ public:
  virtual MatrixPtr subMatrix(size_t startRow, size_t numRows);
-  void copyFrom(std::vector& rows, std::vector& cols,
+  void copyFrom(std::vector& rows,
+                std::vector& cols,
                std::vector& values);
  void copyFrom(const CpuMatrix& src);
@@ -285,9 +299,7 @@ protected:
  // BaseMatrixT interface
public:
-  bool isSparse() const {
-    return true;
-  }
+  bool isSparse() const { return true; }
private:
  using Matrix::copyFrom;
diff --git a/paddle/math/ExecViaCpu.h b/paddle/math/ExecViaCpu.h
index 64e5b8312168499d4267937cdc7f0b872fa5ea37..67fb6c0cda6f46ddf4547b9ec9faaa8931c75eed 100644
--- a/paddle/math/ExecViaCpu.h
+++ b/paddle/math/ExecViaCpu.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
/*
  execViaCpu is used to do operations on GpuMatirx and/or GpuIVector through
  cpu functions.
It can automatically make a temporary CPU copy for the @@ -46,8 +45,10 @@ public: explicit CopyToCpu(Matrix& arg) : arg_(arg) { if (arg.useGpu()) { CHECK(!arg.isTransposed()) << "Not supported"; - copied_ = Matrix::create(arg.getHeight(), arg.getWidth(), - /* trans= */ false, /* useGpu= */ false); + copied_ = Matrix::create(arg.getHeight(), + arg.getWidth(), + /* trans= */ false, + /* useGpu= */ false); copied_->copyFrom(arg); } } @@ -69,8 +70,10 @@ public: explicit CopyToCpu(const Matrix& arg) : arg_(arg) { if (arg.useGpu()) { CHECK(!arg.isTransposed()) << "Not supported"; - copied_ = Matrix::create(arg.getHeight(), arg.getWidth(), - /* trans= */ false, /* useGpu= */ false); + copied_ = Matrix::create(arg.getHeight(), + arg.getWidth(), + /* trans= */ false, + /* useGpu= */ false); copied_->copyFrom(arg); } } @@ -165,7 +168,8 @@ class GpuFuncWrapper2 std::is_function::value, std::is_pointer::value && std::is_function::type>::value, - std::is_class::value, F> {}; + std::is_class::value, + F> {}; template class GpuFuncWrapper diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index e0b2a2bb5b2cdbd845d9be08a8926f0514398458..1217163beecf19c2af215e3d4c72db644cd74b51 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -12,36 +12,79 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "MathFunctions.h" #include "hl_matrix_ops.cuh" #include "hl_matrix_apply.cuh" namespace paddle { -template<> -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, - const float alpha, const float* A, const int lda, - const float* B, const int ldb, - const float beta, float* C, const int ldc) { - cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -template<> -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, - const double alpha, const double* A, const int lda, - const double* B, const int ldb, - const double beta, double* C, const int ldc) { - cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); -} - -template<> -int getrf(const CBLAS_ORDER order, const int M, const int N, - float *A, const int lda, int *ipiv) { +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const int lda, + const float* B, + const int ldb, + const float beta, + float* C, + const int ldc) { + cblas_sgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const double alpha, + const double* A, + const int lda, + const double* B, + const int ldb, + const double beta, + double* C, + const int ldc) { + cblas_dgemm(CblasRowMajor, + transA, + transB, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc); +} + +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + float* A, + const int lda, + int* ipiv) { #ifdef PADDLE_USE_ATLAS return clapack_sgetrf(order, M, N, A, lda, ipiv); #else @@ -49,9 +92,13 @@ int getrf(const CBLAS_ORDER order, const int M, const int N, #endif } -template<> -int getrf(const CBLAS_ORDER order, 
const int M, const int N, - double *A, const int lda, int *ipiv) { +template <> +int getrf(const CBLAS_ORDER order, + const int M, + const int N, + double* A, + const int lda, + int* ipiv) { #ifdef PADDLE_USE_ATLAS return clapack_dgetrf(order, M, N, A, lda, ipiv); #else @@ -59,9 +106,12 @@ int getrf(const CBLAS_ORDER order, const int M, const int N, #endif } -template<> -int getri(const CBLAS_ORDER order, const int N, float *A, - const int lda, const int *ipiv) { +template <> +int getri(const CBLAS_ORDER order, + const int N, + float* A, + const int lda, + const int* ipiv) { #ifdef PADDLE_USE_ATLAS return clapack_sgetri(order, N, A, lda, ipiv); #else @@ -69,9 +119,12 @@ int getri(const CBLAS_ORDER order, const int N, float *A, #endif } -template<> -int getri(const CBLAS_ORDER order, const int N, double *A, - const int lda, const int *ipiv) { +template <> +int getri(const CBLAS_ORDER order, + const int N, + double* A, + const int lda, + const int* ipiv) { #ifdef PADDLE_USE_ATLAS return clapack_dgetri(order, N, A, lda, ipiv); #else @@ -79,149 +132,155 @@ int getri(const CBLAS_ORDER order, const int N, double *A, #endif } -template<> +template <> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } -template<> +template <> void axpy(const int n, const double alpha, const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template<> +template <> float dotProduct(const int n, const float* x, const float* y) { return cblas_sdot(n, x, 1, y, 1); } -template<> +template <> double dotProduct(const int n, const double* x, const double* y) { return cblas_ddot(n, x, 1, y, 1); } #ifdef PADDLE_USE_MKL -template<> +template <> void vExp(const int n, const float* a, float* r) { vsExp(n, a, r); } -template<> +template <> void vExp(const int n, const double* a, double* r) { vdExp(n, a, r); } -template<> +template <> void vPow(const int n, const float* a, const float b, float* r) { vsPowx(n, a, b, r); } -template<> +template <> void vPow(const int n, const double* a, const double b, double* r) { vdPowx(n, a, b, r); } -template<> +template <> void vLog(const int n, const float* a, float* r) { vsLn(n, a, r); } -template<> +template <> void vLog(const int n, const double* a, double* r) { vdLn(n, a, r); } -template<> +template <> void vAdd(const int n, const float* a, const float* b, float* r) { vsAdd(n, a, b, r); } -template<> +template <> void vAdd(const int n, const double* a, const double* b, double* r) { vdAdd(n, a, b, r); } -template<> +template <> void vInvSqrt(const int n, const float* a, float* r) { vsInvSqrt(n, a, r); } -template<> +template <> void vInvSqrt(const int n, const double* a, double* r) { vdInvSqrt(n, a, r); } -template<> +template <> void vLog1p(const int n, const float* a, float* r) { vsLog1p(n, a, r); } -template<> +template <> void vLog1p(const int n, const double* a, double* r) { vdLog1p(n, a, r); } -template<> +template <> void vTanh(const int n, const float* a, float* r) { vsTanh(n, a, r); } -template<> +template <> void vTanh(const int n, const double* a, double* r) { vdTanh(n, a, r); } #else DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a)); -template +template void vExp(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vExp(), const_cast(a), r, 1, n, n, n); + binary::vExp(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a)); -template +template void vLog(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vLog(), const_cast(a), r, 1, n, n, n); + 
binary::vLog(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a)); -template +template void vInvSqrt(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); + binary::vInvSqrt(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a)); -template +template void vLog1p(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vLog1p(), const_cast(a), r, 1, n, n, n); + binary::vLog1p(), const_cast(a), r, 1, n, n, n); } -DEFINE_MATRIX_BINARY_OP(vTanh, - T tmp = -2.0 * a; - tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; - b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); -template +DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a; + tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp; + b = 2.0 / (1.0 + std::exp(tmp)) - 1.0); +template void vTanh(const int n, const T* a, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vTanh(), const_cast(a), r, 1, n, n, n); + binary::vTanh(), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p)); -template +template void vPow(const int n, const T* a, const T b, T* r) { hl_cpu_apply_binary_op, 0, 0>( - binary::vPow(b), const_cast(a), r, 1, n, n, n); + binary::vPow(b), const_cast(a), r, 1, n, n, n); } DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b); -template +template void vAdd(const int n, const T* a, const T* b, T* r) { hl_cpu_apply_ternary_op, 0, 0>(ternary::vAdd(), - const_cast(a), const_cast(b), r, 1, n, n, n , n); + const_cast(a), + const_cast(b), + r, + 1, + n, + n, + n, + n); } template void vExp(const int n, const float* a, float* r); diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 29c07467c7bac9c382f02a5f6ffdcfd87c5b09a0..0741c456780e36c6b87dd44d89ffc601ac928f31 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -35,46 +35,58 @@ extern "C" { namespace paddle { -template -void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, - const T alpha, const T* A, const int lda, - const T* B, const int ldb, - const T beta, T* C, const int ldc); - -template -int getrf(const CBLAS_ORDER Order, const int M, const int N, - T *A, const int lda, int *ipiv); - -template -int getri(const CBLAS_ORDER Order, const int N, T *A, - const int lda, const int *ipiv); - -template +template +void gemm(const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const int lda, + const T* B, + const int ldb, + const T beta, + T* C, + const int ldc); + +template +int getrf(const CBLAS_ORDER Order, + const int M, + const int N, + T* A, + const int lda, + int* ipiv); + +template +int getri( + const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv); + +template void axpy(const int n, const T alpha, const T* x, T* y); -template +template T dotProduct(const int n, const T* x, const T* y); -template +template void vExp(const int n, const T* a, T* r); -template +template void vPow(const int n, const T* a, const T b, T* r); -template +template void vLog(const int n, const T* a, T* r); -template +template void vAdd(const int n, const T* a, const T* b, T* r); -template +template void vInvSqrt(const int n, const T* a, T* r); -template +template void vLog1p(const int n, const T* a, T* r); -template +template void vTanh(const int n, const T* a, T* r); } // namespace paddle diff --git 
a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp index 548f17936381c7e1c4d0c2c9661b197f3f06bd35..878e0b8723025e75f7838e981517f58a3dcb5424 100644 --- a/paddle/math/MathUtils.cpp +++ b/paddle/math/MathUtils.cpp @@ -23,8 +23,8 @@ namespace paddle { * major is rows and minor is cols, according to * major value to initialize minor value" */ -void sparseRand(int* major, int* minor, int nnz, int majorLen, int minorMax, - bool useGpu) { +void sparseRand( + int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) { CHECK(size_t(nnz) > size_t(1)); int* cpuMajor; int* cpuMinor; @@ -57,7 +57,8 @@ void sparseRand(int* major, int* minor, int nnz, int majorLen, int minorMax, cpuMinor[j] = idx; used[idx] = 1; } - std::sort(cpuMinor + cpuMajor[i], cpuMinor + cpuMajor[i + 1], + std::sort(cpuMinor + cpuMajor[i], + cpuMinor + cpuMajor[i + 1], [](int a, int b) { return a < b; }); } /*memcpy result to gpu*/ @@ -67,8 +68,8 @@ void sparseRand(int* major, int* minor, int nnz, int majorLen, int minorMax, } } -int outputSize(int imageSize, int filterSize, int padding, int stride, - bool caffeMode) { +int outputSize( + int imageSize, int filterSize, int padding, int stride, bool caffeMode) { int outputSize; if (!caffeMode) { outputSize = @@ -80,14 +81,14 @@ int outputSize(int imageSize, int filterSize, int padding, int stride, return outputSize; } -int imageSize(int outputSize, int filterSize, int padding, int stride, - bool caffeMode) { +int imageSize( + int outputSize, int filterSize, int padding, int stride, bool caffeMode) { int imageSize; if (!caffeMode) { - imageSize = - (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1; + imageSize = + (outputSize - 1) * stride + filterSize - 2 * padding - stride + 1; } else { - imageSize = (outputSize - 1) * stride + filterSize - 2 * padding; + imageSize = (outputSize - 1) * stride + filterSize - 2 * padding; } CHECK_GE(imageSize, 1); return imageSize; diff --git a/paddle/math/MathUtils.h b/paddle/math/MathUtils.h index 91683dc3e9144df4664f46859ff5e2215dc34144..907116c00281bfcf34c6652564f55a37c3f47a8c 100644 --- a/paddle/math/MathUtils.h +++ b/paddle/math/MathUtils.h @@ -41,8 +41,8 @@ namespace paddle { * * rows is [1, 3, 4, 0, 2, 4, 1, 2, 3, 4] */ -void sparseRand(int* major, int* minor, int nnz, int majorLen, int minorMax, - bool useGpu); +void sparseRand( + int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu); /** * Calculate output size based on caffeMode_. @@ -57,14 +57,14 @@ void sparseRand(int* major, int* minor, int nnz, int majorLen, int minorMax, * - output: (012), (234), (456), (678), (9) * - outputSize = 5; */ -int outputSize(int imageSize, int filterSize, int padding, int stride, - bool caffeMode); +int outputSize( + int imageSize, int filterSize, int padding, int stride, bool caffeMode); /** * Calculate image size based on output size and caffeMode_. * It is the reverse function of outputSize() */ -int imageSize(int outputSize, int filterSize, int padding, int stride, - bool caffeMode); +int imageSize( + int outputSize, int filterSize, int padding, int stride, bool caffeMode); } // namespace paddle diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 706a598d0c33762b0578190ea4a0aa06247a88ef..b70b47a5fcc72edea8fa5a680c4af962ea0f4ae9 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -40,58 +40,75 @@ inline real _square(real a) { return a * a; } inline real _safelog(real a) { return a > 0.0f ? 
std::log(a) : -40.0f; } -Matrix::Matrix(MemoryHandlePtr memHandle, size_t height, size_t width, - bool trans, bool use_gpu) +Matrix::Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu) : BaseMatrix( - height, width, + height, + width, memHandle ? (reinterpret_cast(memHandle->getBuf())) : nullptr, - trans, use_gpu) { + trans, + use_gpu) { elementCnt_ = width * height; memoryHandle_ = memHandle; } -Matrix::Matrix(real* data, size_t height, size_t width, bool trans, - bool use_gpu) +Matrix::Matrix( + real* data, size_t height, size_t width, bool trans, bool use_gpu) : BaseMatrix(height, width, data, trans, use_gpu) { elementCnt_ = width * height; } -Matrix::Matrix(real* data, size_t height, size_t width, size_t stride, - bool trans, bool use_gpu) +Matrix::Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu) : BaseMatrix(height, width, stride, data, trans, use_gpu) { elementCnt_ = width * height; } -MatrixPtr Matrix::createSparseMatrix(real* data, int* row, int* col, - size_t height, size_t width, +MatrixPtr Matrix::createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType, /*value type*/ - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu) { if (useGpu) { - return std::make_shared(data, row, col, height, width, nnz, - valueType, format, trans); + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); } else { - return std::make_shared(data, row, col, height, width, nnz, - valueType, format, trans); + return std::make_shared( + data, row, col, height, width, nnz, valueType, format, trans); } } -MatrixPtr Matrix::createSparseMatrix(size_t height, size_t width, +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType, /*value type*/ - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu) { if (useGpu) { - return std::make_shared(height, width, nnz, valueType, - format, trans); + return std::make_shared( + height, width, nnz, valueType, format, trans); } else { - return std::make_shared(height, width, nnz, valueType, - format, trans); + return std::make_shared( + height, width, nnz, valueType, format, trans); } } -MatrixPtr Matrix::create(MemoryHandlePtr memHandle, size_t height, size_t width, +MatrixPtr Matrix::create(MemoryHandlePtr memHandle, + size_t height, + size_t width, bool trans) { if (auto gpuHandle = std::dynamic_pointer_cast(memHandle)) { return std::make_shared(gpuHandle, height, width, trans); @@ -112,8 +129,8 @@ MatrixPtr Matrix::create(size_t height, size_t width, bool trans, bool useGpu) { } } -MatrixPtr Matrix::create(real* data, size_t height, size_t width, bool trans, - bool useGpu) { +MatrixPtr Matrix::create( + real* data, size_t height, size_t width, bool trans, bool useGpu) { if (useGpu) { return std::make_shared(data, height, width, trans); } else { @@ -121,8 +138,12 @@ MatrixPtr Matrix::create(real* data, size_t height, size_t width, bool trans, } } -MatrixPtr Matrix::create(real* data, size_t height, size_t width, size_t stride, - bool trans, bool useGpu) { +MatrixPtr Matrix::create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool useGpu) { if (useGpu) { return std::make_shared(data, height, width, stride, trans); } else { @@ -130,20 +151,23 @@ MatrixPtr Matrix::create(real* 
data, size_t height, size_t width, size_t stride, } } -MatrixPtr Matrix::createSparseMatrix(size_t height, size_t width, size_t nnz, - SparseValueType valueType, bool trans, +MatrixPtr Matrix::createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + bool trans, bool useGpu) { if (useGpu) { - return std::make_shared(height, width, nnz, valueType, - SPARSE_CSR, trans); + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); } else { - return std::make_shared(height, width, nnz, valueType, - SPARSE_CSR, trans); + return std::make_shared( + height, width, nnz, valueType, SPARSE_CSR, trans); } } -void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, - bool trans, bool useGpu) { +void Matrix::resizeOrCreate( + MatrixPtr& matrix, size_t height, size_t width, bool trans, bool useGpu) { if (!matrix) { matrix = Matrix::create(height, width, trans, useGpu); } else { @@ -152,14 +176,17 @@ void Matrix::resizeOrCreate(MatrixPtr& matrix, size_t height, size_t width, } } -void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, size_t height, - size_t width, size_t nnz, +void Matrix::resizeOrCreateSparseMatrix(MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, SparseValueType valueType, - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu) { if (!matrix) { - matrix = Matrix::createSparseMatrix(height, width, nnz, valueType, format, - trans, useGpu); + matrix = Matrix::createSparseMatrix( + height, width, nnz, valueType, format, trans, useGpu); } else { CHECK(dynamic_cast(matrix.get()) || dynamic_cast(matrix.get())); @@ -176,7 +203,9 @@ void Matrix::reshape(size_t height, size_t width) { stride_ = width_; } -MatrixPtr Matrix::subMatrix(size_t startRow, size_t endRow, size_t startCol, +MatrixPtr Matrix::subMatrix(size_t startRow, + size_t endRow, + size_t startCol, size_t endCol) { CHECK_LE(startRow, endRow); CHECK_LE(endRow, getHeight()); @@ -184,8 +213,11 @@ MatrixPtr Matrix::subMatrix(size_t startRow, size_t endRow, size_t startCol, CHECK_LE(endCol, getWidth()); return Matrix::create(getData() + startRow * getStride() + startCol, - endRow - startRow, endCol - startCol, getStride(), - trans_, useGpu_); + endRow - startRow, + endCol - startCol, + getStride(), + trans_, + useGpu_); } void Matrix::setDiag(real value) { @@ -199,7 +231,10 @@ void Matrix::setDiag(real value) { GpuMatrix::GpuMatrix(size_t height, size_t width, bool trans) : Matrix(std::make_shared(height * width * sizeof(real)), - height, width, trans, true) {} + height, + width, + trans, + true) {} GpuMatrix::~GpuMatrix() {} @@ -258,11 +293,11 @@ void GpuMatrix::copyFrom(const Matrix& src) { CHECK(elementCnt_ == src.getElementCnt()); if (typeid(src) == typeid(CpuMatrix)) { - hl_memcpy_host2device(data_, const_cast(src.getData()), - sizeof(real) * elementCnt_); + hl_memcpy_host2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); } else if (typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_device2device(data_, const_cast(src.getData()), - sizeof(real) * elementCnt_); + hl_memcpy_device2device( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); } else { LOG(FATAL) << "Wrong"; } @@ -272,8 +307,10 @@ void GpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { CHECK(isContiguous()); CHECK(src.isContiguous()); CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_async(this->getData(), const_cast(src.getData()), - sizeof(real) * elementCnt_, stream); + 
hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); } void GpuMatrix::copyFrom(const real* hostSrc, size_t size) { @@ -324,7 +361,9 @@ MatrixPtr GpuMatrix::getTranspose() { if (memoryHandle_.get() != NULL) { MatrixPtr copy_T( new GpuMatrix(std::dynamic_pointer_cast(memoryHandle_), - height_, width_, true)); + height_, + width_, + true)); return copy_T; } else { MatrixPtr copy_T(new GpuMatrix(data_, height_, width_, true)); @@ -346,7 +385,6 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc); } - MatrixPtr GpuMatrix::getInverse() { MatrixPtr matInv; inverse(matInv, true); @@ -379,17 +417,16 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) { CHECK(b.getHeight() == 1) << "the Bias should be a vector"; CHECK_LE(b.getWidth(), getWidth()); CHECK_EQ(getWidth() % b.getWidth(), 0UL); - hl_matrix_add_shared_bias(getData(), b.getData(), b.getWidth(), - getHeight(), getWidth(), scale); + hl_matrix_add_shared_bias( + getData(), b.getData(), b.getWidth(), getHeight(), getWidth(), scale); } - void GpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(width_, a.getWidth()); GpuSparseMatrix* sMatPtr = dynamic_cast(&a); if (!sMatPtr) { - sumCols(a, /* scaleSum= */scale, /* scaleDest= */1); + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); } else { real* data = getData(); hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); @@ -397,15 +434,13 @@ void GpuMatrix::collectBias(Matrix& a, real scale) { } } - void GpuMatrix::collectSharedBias(Matrix& a, real scale) { CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(a.getWidth() % getWidth(), 0UL); - hl_matrix_collect_shared_bias(getData(), a.getData(), getWidth(), - a.getHeight(), a.getWidth(), scale); + hl_matrix_collect_shared_bias( + getData(), a.getData(), getWidth(), a.getHeight(), a.getWidth(), scale); } - void GpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode) { @@ -421,7 +456,9 @@ void GpuMatrix::sequenceAvgForward(Matrix& a, } /* this = scaleAB*(a*b) + scaleT*this */ -void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, +void GpuMatrix::mul(const GpuMatrix& a, + const GpuMatrix& b, + real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; @@ -453,11 +490,24 @@ void GpuMatrix::mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, hl_trans_op_t transa = !a.isTransposed() ? HPPL_OP_N : HPPL_OP_T; hl_trans_op_t transb = !b.isTransposed() ? 
HPPL_OP_N : HPPL_OP_T; - hl_matrix_mul(A_d, transa, B_d, transb, C_d, dimM, dimN, dimK, scaleAB, - scaleT, lda, ldb, ldc); -} - -void GpuMatrix::mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, + hl_matrix_mul(A_d, + transa, + B_d, + transb, + C_d, + dimM, + dimN, + dimK, + scaleAB, + scaleT, + lda, + ldb, + ldc); +} + +void GpuMatrix::mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, real scaleT) { CHECK(isContiguous()); CHECK(b.isContiguous()); @@ -475,11 +525,21 @@ void GpuMatrix::mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, hl_sparse_matrix_s A_d = a.sMatrix_.get(); real* B_d = b.data_; real* C_d = data_; - hl_matrix_csr_mul_dense(A_d, transA, B_d, HPPL_OP_N, C_d, height_, width_, - b.height_, scaleAB, scaleT); -} - -void GpuMatrix::mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, + hl_matrix_csr_mul_dense(A_d, + transA, + B_d, + HPPL_OP_N, + C_d, + height_, + width_, + b.height_, + scaleAB, + scaleT); +} + +void GpuMatrix::mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, real scaleT) { CHECK(isContiguous()); CHECK(a.isContiguous()); @@ -497,11 +557,27 @@ void GpuMatrix::mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, << "Matrix dimensions are not equal"; } if (b.format_ == SPARSE_CSC) { - hl_matrix_dense_mul_csc(A_d, HPPL_OP_N, B_d, transB, C_d, height_, width_, - a.width_, scaleAB, scaleT); + hl_matrix_dense_mul_csc(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); } else { - hl_matrix_dense_mul_csr(A_d, HPPL_OP_N, B_d, transB, C_d, height_, width_, - a.width_, scaleAB, scaleT); + hl_matrix_dense_mul_csr(A_d, + HPPL_OP_N, + B_d, + transB, + C_d, + height_, + width_, + a.width_, + scaleAB, + scaleT); } } @@ -510,7 +586,9 @@ void GpuMatrix::mul(const MatrixPtr a, const MatrixPtr b) { mul(a, b, 1.0, 0.0); } -void GpuMatrix::mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, +void GpuMatrix::mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { GpuMatrixPtr a_ptr = std::dynamic_pointer_cast(a); GpuMatrixPtr b_ptr = std::dynamic_pointer_cast(b); @@ -563,8 +641,14 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) { size_t tableSize = table.getHeight(); int* index = ids.getData(); - hl_matrix_select_rows(a, stride_, table.getData(), table.stride_, index, - numSamples, tableSize, dim); + hl_matrix_select_rows(a, + stride_, + table.getData(), + table.stride_, + index, + numSamples, + tableSize, + dim); #endif } @@ -581,15 +665,21 @@ void GpuMatrix::addToRows(Matrix& table, IVector& ids) { size_t tableSize = table.getHeight(); int* index = ids.getData(); - hl_matrix_add_to_rows(table.getData(), table.stride_, a, stride_, index, - numSamples, tableSize, dim); + hl_matrix_add_to_rows(table.getData(), + table.stride_, + a, + stride_, + index, + numSamples, + tableSize, + dim); #endif } void GpuMatrix::colMerge(Matrix& src) { CHECK(src.height_ == height_); if (!trans_ && !src.trans_) { - sumRows(src, /* scaleSum= */1, /* scaleDest= */0); + sumRows(src, /* scaleSum= */ 1, /* scaleDest= */ 0); } else { LOG(FATAL) << "Is not supported"; } @@ -599,7 +689,7 @@ void GpuMatrix::rowSum(Matrix& sum) { CHECK_EQ(sum.getHeight(), getHeight()); CHECK_EQ(sum.getWidth(), (size_t)1); - sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0); + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); } void GpuMatrix::rowMax(Matrix& max) { @@ -617,8 +707,13 @@ void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { 
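// Per-row top-k: for each of the numSamples rows, the beam largest values are written to maxVal and their column indices to maxIds, so maxIds must hold numSamples * beam entries (see the CHECKs below); ordering is by descending value, as in the partial_sort used by CpuSparseMatrix::rowMax earlier in this change.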
CHECK_EQ(maxIds.getSize(), numSamples * beam); CHECK_EQ(maxVal.getHeight(), numSamples); - hl_matrix_top_k(maxVal.getData(), maxVal.getStride(), maxIds.getData(), - this->getData(), this->getStride(), this->getWidth(), beam, + hl_matrix_top_k(maxVal.getData(), + maxVal.getStride(), + maxIds.getData(), + this->getData(), + this->getStride(), + this->getWidth(), + beam, numSamples); #endif } @@ -634,7 +729,9 @@ void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { LOG(FATAL) << "Is not supported"; } -void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, +void GpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, size_t groups) { CHECK(dynamic_cast(&a)); CHECK(dynamic_cast(&id)); @@ -646,11 +743,13 @@ void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, real* output = getData(); int* idForGpu = id.getData(); - hl_maxout_forward(input, output, idForGpu, batchSize, size, size / channels, - groups); + hl_maxout_forward( + input, output, idForGpu, batchSize, size, size / channels, groups); } -void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, +void GpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, size_t groups) { CHECK(dynamic_cast(&a)); CHECK(dynamic_cast(&id)); @@ -662,8 +761,8 @@ void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, const real* output = a.getData(); const int* idForGpu = id.getData(); - hl_maxout_backward(input, output, idForGpu, batchSize, size, size / channels, - groups); + hl_maxout_backward( + input, output, idForGpu, batchSize, size, size / channels, groups); } /* calculate the error of classification */ @@ -679,8 +778,8 @@ void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) { real* recResult_d = data_; int* label_d = label_ptr->getData(); - hl_matrix_classification_error(output_d, label_d, recResult_d, height_, - output_ptr->width_); + hl_matrix_classification_error( + output_d, label_d, recResult_d, height_, output_ptr->width_); } /* copy -log(output[i * width + label]) to this->data[i] */ @@ -717,13 +816,15 @@ void GpuMatrix::oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { hl_matrix_cross_entropy_bp(grad_d, output_d, label_d, height_, width_); } -void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, +void GpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha) { LOG(FATAL) << "Not implemented"; } void GpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, real alpha) { + IVector& label, + real alpha) { LOG(FATAL) << "Not implemented"; } @@ -790,8 +891,10 @@ void GpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { LOG(FATAL) << "not supported: GpuSparseMatrix as label"; } - BaseMatrix::sumOfSquaredDiffs(output, label, - /* scaleSum= */1, /* scaleDest= */1); + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); } void GpuMatrix::sumOfSquaresBp(Matrix& outputV, Matrix& label) { @@ -826,9 +929,12 @@ void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { real* y = output2.getData(); hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale); } -void GpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, - Matrix& prevGrad2, real scale) { +void GpuMatrix::cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale) { CHECK(output.useGpu_ == true && prevOut1.useGpu_ ==
true && prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true && prevGrad2.useGpu_ == true) @@ -852,8 +958,16 @@ void GpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, real* prevOutY = prevOut2.getData(); real* prevGradX = prevGrad1.getData(); real* prevGradY = prevGrad2.getData(); - hl_cossim_derivative(grad, out, prevOutX, prevOutY, prevGradX, prevGradY, dim, - prevOut1.getHeight(), prevOut2.getHeight(), scale); + hl_cossim_derivative(grad, + out, + prevOutX, + prevOutY, + prevGradX, + prevGradY, + dim, + prevOut1.getHeight(), + prevOut2.getHeight(), + scale); } void GpuMatrix::randomizeUniform() { @@ -902,9 +1016,17 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } -void GpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, int outputH, +void GpuMatrix::convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, int outputW) { CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; @@ -915,15 +1037,34 @@ void GpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, size_t elemCnt = outputH * outputW * blockH * blockW * channels; CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - hl_expand_feature2col(feature.getData(), channels, feaImgHeight, feaImgWidth, - blockH, blockW, strideH, strideW, paddingH, paddingW, - outputH, outputW, getData()); -} - -void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, - int thisImgWidth, int channels, int blockH, - int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, real alpha, + hl_expand_feature2col(feature.getData(), + channels, + feaImgHeight, + feaImgWidth, + blockH, + blockW, + strideH, + strideW, + paddingH, + paddingW, + outputH, + outputW, + getData()); +} + +void GpuMatrix::convShrink(Matrix& expandFeat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha, real beta) { CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal"; CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), @@ -933,16 +1074,34 @@ void GpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, size_t elemCnt = outputH * outputW * blockW * blockH * channels; CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) << "Matrix dimensions are not equal"; - hl_shrink_col2feature(expandFeat.getData(), channels, thisImgHeight, - thisImgWidth, blockH, blockW, strideH, strideW, - paddingH, paddingW, outputH, outputW, getData(), alpha, + hl_shrink_col2feature(expandFeat.getData(), + channels, + thisImgHeight, + thisImgWidth, + blockH, + blockW, + strideH, + strideW, + paddingH, + paddingW, + outputH, + outputW, + getData(), + alpha, beta); } -void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, size_t paddingH, +void GpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, size_t paddingW) { 
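// Layout contract (see the CHECKs below): each output row packs channels * outputH * outputW values. outputH and outputW are computed by the caller, presumably via outputSize() from MathUtils.h, e.g. outputH = outputSize(imgSizeH, sizeY, paddingH, strideH, caffeMode) and outputW = outputSize(imgSizeW, sizeX, paddingW, strideW, caffeMode).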
CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; @@ -954,17 +1113,38 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); - hl_maxpool_forward(frameNum, inputData, channels, height, width, outputH, - outputW, sizeX, sizeY, strideH, strideW, paddingH, - paddingW, data_, getStride()); -} - -void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, Matrix& outGrad, Matrix& outV, - size_t sizeX, size_t sizeY, size_t strideH, - size_t strideW, size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + hl_maxpool_forward(frameNum, + inputData, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride()); +} + +void GpuMatrix::maxPoolBackward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && outV.useGpu_ == true) << "Matrix type are not equal"; @@ -982,16 +1162,38 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat, size_t imgSizeH, CHECK(outGrad.getHeight() == outV.getHeight() && outGrad.getWidth() == outV.getWidth()); - hl_maxpool_backward(frameNum, inputData, outData, outDiff, channels, height, - width, outputH, outputW, sizeX, sizeY, strideH, strideW, - paddingH, paddingW, scaleTargets, scaleOutput, data_, + hl_maxpool_backward(frameNum, + inputData, + outData, + outDiff, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, outGrad.getStride()); } -void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, size_t paddingH, +void GpuMatrix::avgPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, size_t paddingW) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; @@ -1003,16 +1205,35 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t imgSizeH, CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); - hl_avgpool_forward(frameNum, inputData, channels, height, width, outputH, - outputW, sizeX, sizeY, strideH, strideW, paddingH, - paddingW, data_, getStride()); -} - -void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, size_t outputH, - size_t outputW, real scaleTargets, - real scaleOutput, size_t paddingH, + hl_avgpool_forward(frameNum, + inputData, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + data_, + getStride()); +} + +void GpuMatrix::avgPoolBackward(Matrix& outGrad, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, size_t paddingW) { CHECK(outGrad.useGpu_ == true) << "Matrix type 
are not equal"; @@ -1025,15 +1246,32 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, size_t imgSizeH, CHECK(height_ == outGrad.getHeight()); CHECK(outGrad.getWidth() == outputH * outputW * channels); - hl_avgpool_backward(frameNum, outDiff, channels, height, width, outputH, - outputW, sizeX, sizeY, strideH, strideW, paddingH, - paddingW, scaleTargets, scaleOutput, data_, + hl_avgpool_backward(frameNum, + outDiff, + channels, + height, + width, + outputH, + outputW, + sizeX, + sizeY, + strideH, + strideW, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, outGrad.getStride()); } -void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, - size_t imgSizeW, Matrix& denoms, - size_t channels, size_t sizeX, float scale, +void GpuMatrix::crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, float pow) { size_t num = input.getHeight(); size_t height = imgSizeH; @@ -1043,14 +1281,27 @@ void GpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, CHECK(denoms.getHeight() == input.getHeight() && denoms.getWidth() == input.getWidth() && input.getHeight() == height_ && input.getWidth() == width_); - hl_CMRNorm_forward(num, input.getData(), denoms.getData(), data_, channels, - height, width, sizeX, scale, -pow); -} - -void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, - Matrix& preOutV, Matrix& localOutV, - size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, float scale, + hl_CMRNorm_forward(num, + input.getData(), + denoms.getData(), + data_, + channels, + height, + width, + sizeX, + scale, + -pow); +} + +void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + float scale, float pow) { size_t num = preOutV.getHeight(); size_t height = imgSizeH; @@ -1063,12 +1314,22 @@ void GpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, CHECK(denoms.getHeight() == localGrad.getHeight() && denoms.getWidth() == localGrad.getWidth()); - hl_CMRNorm_backward(num, preOutV.getData(), denoms.getData(), - localOutV.getData(), localGrad.getData(), data_, channels, - height, width, sizeX, -pow, 2.0f * pow * scale); -} - -void GpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, + hl_CMRNorm_backward(num, + preOutV.getData(), + denoms.getData(), + localOutV.getData(), + localGrad.getData(), + data_, + channels, + height, + width, + sizeX, + -pow, + 2.0f * pow * scale); +} + +void GpuMatrix::maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&input)); CHECK(dynamic_cast(&sequence)); @@ -1085,11 +1346,12 @@ void GpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, CHECK_EQ(numSequences, sequence.getSize() - 1); CHECK_EQ(numSequences * dim, index.getSize()); - hl_max_sequence_forward(inputData, starts, outData, maxIndex, numSequences, - dim); + hl_max_sequence_forward( + inputData, starts, outData, maxIndex, numSequences, dim); } -void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, +void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&outputGrad)); CHECK(dynamic_cast(&sequence)); @@ -1108,10 +1370,13 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim); } -void 
GpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, +void GpuMatrix::contextProjectionForward(MatrixPtr input, + MatrixPtr weight, const IVector& sequence, - int contextLength, int contextStart, - size_t beginPad, bool isPadding) { + int contextLength, + int contextStart, + size_t beginPad, + bool isPadding) { CHECK(dynamic_cast(input.get())); CHECK(dynamic_cast(&sequence)); if (weight) CHECK(dynamic_cast(weight.get())); @@ -1125,9 +1390,16 @@ void GpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, real* inputData = input->getData(); const int* starts = sequence.getData(); - hl_context_projection_forward( - inputData, starts, isPadding ? weight->getData() : NULL, outData, - numSequences, inputDim, contextLength, contextStart, beginPad, isPadding); + hl_context_projection_forward(inputData, + starts, + isPadding ? weight->getData() : NULL, + outData, + numSequences, + inputDim, + contextLength, + contextStart, + beginPad, + isPadding); } void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad, @@ -1146,14 +1418,20 @@ void GpuMatrix::contextProjectionBackwardData(MatrixPtr inputGrad, real* inGrad = inputGrad->getData(); const int* starts = sequence.getData(); - hl_context_projection_backward_data(outGrad, starts, inGrad, numSequences, - inputDim, contextLength, contextStart); + hl_context_projection_backward_data(outGrad, + starts, + inGrad, + numSequences, + inputDim, + contextLength, + contextStart); } void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, int totalPad, + int contextStart, + int totalPad, size_t beginPad) { CHECK(dynamic_cast(weightGrad.get())); CHECK(dynamic_cast(&sequence)); @@ -1167,9 +1445,15 @@ void GpuMatrix::contextProjectionBackwardWeight(MatrixPtr weightGrad, real* wtGrad = weightGrad->getData(); const int* starts = sequence.getData(); - hl_context_projection_backward_weight(outGrad, starts, wtGrad, numSequences, - weightDim, totalPad, contextLength, - contextStart, beginPad); + hl_context_projection_backward_weight(outGrad, + starts, + wtGrad, + numSequences, + weightDim, + totalPad, + contextLength, + contextStart, + beginPad); } void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { @@ -1193,8 +1477,8 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (this->getHeight() * this->getWidth()); - hl_param_relu_backward_w(wgrad, ograd, input, numElements, numSamples, - partial_sum); + hl_param_relu_backward_w( + wgrad, ograd, input, numElements, numSamples, partial_sum); } void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { @@ -1205,8 +1489,8 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); - hl_param_relu_backward_diff(ograd, input, w, diff, numElements, numSamples, - partial_sum); + hl_param_relu_backward_diff( + ograd, input, w, diff, numElements, numSamples, partial_sum); } void GpuMatrix::addColumnVector(const Matrix& b) { @@ -1229,15 +1513,24 @@ void GpuMatrix::bilinearForward(const Matrix& in, const size_t inputH = in.getHeight(); real* outData = getData(); - const real* inData = in.getData(); + const real* inData = in.getData(); if (inImgH == outImgW && inImgW == outImgW) { this->copyFrom(in); } else 
{ - hl_bilinear_forward( - inData, inImgH, inImgW, inputH, inputW, outData, - outImgH, outImgW, outputH, outputW, numChannels, - ratioH, ratioW); + hl_bilinear_forward(inData, + inImgH, + inImgW, + inputH, + inputW, + outData, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); } } @@ -1262,47 +1555,56 @@ void GpuMatrix::bilinearBackward(const Matrix& out, if (outImgH == inImgH && outImgW == inImgW) { this->add(const_cast(out)); } else { - hl_bilinear_backward( - inGrad, inImgH, inImgW, inputH, inputW, outGrad, - outImgH, outImgW, outputH, outputW, numChannels, - ratioH, ratioW); + hl_bilinear_backward(inGrad, + inImgH, + inImgW, + inputH, + inputW, + outGrad, + outImgH, + outImgW, + outputH, + outputW, + numChannels, + ratioH, + ratioW); } } void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - GpuMatrix* outputPtr = dynamic_cast(&output); - auto labelPtr = dynamic_cast(&label); - - CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; - CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; - CHECK(height_ == outputPtr->height_ && width_ == 1 - && outputPtr->width_ == labelPtr->getWidth() - && outputPtr->height_ == labelPtr->getHeight()) - << "Matrix dimensions are not equal"; + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); + + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == 1 && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; - real* output_d = outputPtr->data_; - real* entropy_d = data_; - hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); - hl_matrix_multi_binary_cross_entropy( - output_d, entropy_d, mat_d, height_, outputPtr->width_); + real* output_d = outputPtr->data_; + real* entropy_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy( + output_d, entropy_d, mat_d, height_, outputPtr->width_); } -void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix &output, Matrix &label) { - GpuMatrix* outputPtr = dynamic_cast(&output); - auto labelPtr = dynamic_cast(&label); +void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + GpuMatrix* outputPtr = dynamic_cast(&output); + auto labelPtr = dynamic_cast(&label); - CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; - CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; - CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ - && outputPtr->width_ == labelPtr->getWidth() - && outputPtr->height_ == labelPtr->getHeight()) - << "Matrix dimensions are not equal"; + CHECK(outputPtr && labelPtr) << "Invalid argument pointer"; + CHECK(labelPtr->format_ == SPARSE_CSR) << "Matrix format not supported"; + CHECK(height_ == outputPtr->height_ && width_ == outputPtr->width_ && + outputPtr->width_ == labelPtr->getWidth() && + outputPtr->height_ == labelPtr->getHeight()) + << "Matrix dimensions are not equal"; - real* output_d = outputPtr->data_; - real* grad_d = data_; - hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); - hl_matrix_multi_binary_cross_entropy_bp( - output_d, grad_d, mat_d, height_, width_); + real* output_d = outputPtr->data_; + real* grad_d = data_; + hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); + hl_matrix_multi_binary_cross_entropy_bp( + output_d, grad_d, mat_d, height_, 
width_); } /** @@ -1311,7 +1613,10 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix &output, Matrix &label) { CpuMatrix::CpuMatrix(size_t height, size_t width, bool trans) : Matrix(std::make_shared(height * width * sizeof(real)), - height, width, trans, false) {} + height, + width, + trans, + false) {} CpuMatrix::~CpuMatrix() {} @@ -1333,8 +1638,8 @@ void CpuMatrix::copyFrom(const Matrix& src) { if (typeid(src) == typeid(GpuMatrix)) { CHECK(src.isContiguous()); CHECK(elementCnt_ == src.getElementCnt()); - hl_memcpy_device2host(data_, const_cast(src.getData()), - sizeof(real) * elementCnt_); + hl_memcpy_device2host( + data_, const_cast(src.getData()), sizeof(real) * elementCnt_); } else if (typeid(src) == typeid(CpuMatrix) || typeid(src) == typeid(SharedCpuMatrix)) { CHECK(src.isContiguous()); @@ -1399,8 +1704,10 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { CHECK(src.isContiguous()); CHECK(elementCnt_ == src.getElementCnt()); if (typeid(src) == typeid(GpuMatrix)) { - hl_memcpy_async(this->getData(), const_cast(src.getData()), - sizeof(real) * elementCnt_, stream); + hl_memcpy_async(this->getData(), + const_cast(src.getData()), + sizeof(real) * elementCnt_, + stream); } else if (typeid(src) == typeid(CpuMatrix)) { memcpy(data_, src.getData(), sizeof(real) * elementCnt_); } else { @@ -1502,7 +1809,7 @@ void CpuMatrix::accumulateColSum(Matrix& src) { CHECK_EQ(getWidth(), src.getWidth()); CHECK_EQ(getHeight(), (size_t)1); - sumCols(src, /* scaleSum= */1, /* scaleDest= */1); + sumCols(src, /* scaleSum= */ 1, /* scaleDest= */ 1); } real CpuMatrix::getAbsSum() { @@ -1519,8 +1826,10 @@ real CpuMatrix::getAbsSum() { MatrixPtr CpuMatrix::getTranspose() { if (memoryHandle_.get() != NULL) { return std::make_shared( - std::dynamic_pointer_cast(memoryHandle_), height_, - width_, true); + std::dynamic_pointer_cast(memoryHandle_), + height_, + width_, + true); } else { MatrixPtr copy_T(new CpuMatrix(data_, height_, width_, true)); return copy_T; @@ -1545,7 +1854,6 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { } } - MatrixPtr CpuMatrix::getInverse() { MatrixPtr matInv; inverse(matInv, true); @@ -1586,9 +1894,17 @@ void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) { CHECK_EQ(info, 0); } -void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, int outputH, +void CpuMatrix::convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, int outputW) { CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; @@ -1626,10 +1942,19 @@ void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, } } -void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, - int thisImgWidth, int channels, int blockH, - int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, real alpha, +void CpuMatrix::convShrink(Matrix& expandFeat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha, real beta) { CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), @@ -1666,10 +1991,17 @@ void CpuMatrix::convShrink(Matrix& expandFeat, int thisImgHeight, } } -void 
CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, size_t paddingH, +void CpuMatrix::maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, size_t paddingW) { real* inputData = inputMat.getData(); real* outData = data_; @@ -1717,12 +2049,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, } } -void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { +void CpuMatrix::maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { size_t num = image.getHeight(); size_t channels = size_t(width_ / imgSizeH / imgSizeW); CHECK(image.getWidth() == imgSizeH * imgSizeW * channels); @@ -1772,10 +2113,17 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, } } -void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, size_t outputH, - size_t outputW, size_t paddingH, +void CpuMatrix::avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, size_t paddingW) { // The main loop size_t num = input.getHeight(); @@ -1820,11 +2168,19 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, } } -void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, size_t strideH, - size_t strideW, size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { +void CpuMatrix::avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputH / outputW; CHECK(imgSizeH * imgSizeW * channels == getWidth()); @@ -1863,9 +2219,13 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, } } -void CpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, - size_t imgSizeW, Matrix& denoms, - size_t channels, size_t sizeX, float scale, +void CpuMatrix::crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, float pow) { size_t num = input.getHeight(); size_t height = imgSizeH; @@ -1915,10 +2275,15 @@ void CpuMatrix::crossMapNormalFwd(Matrix& input, size_t imgSizeH, integralData = NULL; } -void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, - Matrix& preOutV, Matrix& localOutV, - size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t size, float scale, +void 
CpuMatrix::crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t size, + float scale, float pow) { LOG(FATAL) << "Not implemented"; @@ -1937,7 +2302,8 @@ void CpuMatrix::crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, * Output: output size is the number of input sequences (NOT input instances). * output[i] is set to max_{for each instance in this sequence}{input[i]} */ -void CpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, +void CpuMatrix::maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&input)); CHECK(dynamic_cast(&sequence)); @@ -1978,7 +2344,8 @@ void CpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, } } -void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, +void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index) { CHECK(dynamic_cast(&outputGrad)); CHECK(dynamic_cast(&sequence)); @@ -2004,10 +2371,13 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, } } -void CpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, +void CpuMatrix::contextProjectionForward(MatrixPtr input, + MatrixPtr weight, const IVector& sequence, - int contextLength, int contextStart, - size_t beginPad, bool isPadding) { + int contextLength, + int contextStart, + size_t beginPad, + bool isPadding) { CHECK(dynamic_cast(input.get())); CHECK(dynamic_cast(&sequence)); if (weight) CHECK(dynamic_cast(weight.get())); @@ -2058,8 +2428,10 @@ void CpuMatrix::contextProjectionForward(MatrixPtr input, MatrixPtr weight, void CpuMatrix::contextProjectionBackward(MatrixPtr inputGrad, MatrixPtr weightGrad, const IVector& sequence, - int contextLength, int contextStart, - size_t beginPad, bool isPadding) { + int contextLength, + int contextStart, + size_t beginPad, + bool isPadding) { if (inputGrad) CHECK(dynamic_cast(inputGrad.get())); if (weightGrad) CHECK(dynamic_cast(weightGrad.get())); CHECK(dynamic_cast(&sequence)); @@ -2125,15 +2497,15 @@ inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) { } } -inline void colVecAddTo(real* a, const real* b, size_t len, size_t aWidth, - size_t bWidth) { +inline void colVecAddTo( + real* a, const real* b, size_t len, size_t aWidth, size_t bWidth) { for (unsigned int i = 0; i < len; ++i) { a[i * aWidth] += b[i * bWidth]; } } -inline void colVecAddTo(real* a, real* b, real c, size_t len, size_t aWidth, - size_t bWidth) { +inline void colVecAddTo( + real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) { for (unsigned int i = 0; i < len; ++i) { a[i * aWidth] += b[i * bWidth] * c; } @@ -2189,7 +2561,7 @@ void CpuMatrix::collectBias(Matrix& a, real scale) { CHECK_EQ(width_, a.getWidth()); CpuSparseMatrix* aptr = dynamic_cast(&a); if (!aptr) { - sumCols(a, /* scaleSum= */scale, /* scaleDest= */1); + sumCols(a, /* scaleSum= */ scale, /* scaleDest= */ 1); } else { size_t nnz = aptr->getElementCnt(); int* cols = aptr->getCols(); @@ -2240,15 +2612,17 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, dataMtx->setData(src + starts[i] * width, sequenceLength, width); if (mode == 0) { // plain average - outMtx->sumCols(*dataMtx, (real)1 / (real)sequenceLength, - /* scaleDest= */1); + outMtx->sumCols(*dataMtx, + (real)1 / (real)sequenceLength, + /* scaleDest= */ 1); } else if (mode == 1) { // sum instead of average - outMtx->sumCols(*dataMtx, /* scaleSum= */1, 
/* scaleDest= */1); + outMtx->sumCols(*dataMtx, /* scaleSum= */ 1, /* scaleDest= */ 1); } else if (mode == 2) { // divide by square root of sequenceLength - outMtx->sumCols(*dataMtx, (real)1 / std::sqrt(sequenceLength), - /* scaleDest= */1); + outMtx->sumCols(*dataMtx, + (real)1 / std::sqrt(sequenceLength), + /* scaleDest= */ 1); } else { LOG(FATAL) << "should not reach here"; } @@ -2256,27 +2630,37 @@ void CpuMatrix::sequenceAvgForward(Matrix& a, } /* this = scaleAB*(a*b) + scaleT*this*/ -void CpuMatrix::mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, +void CpuMatrix::mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { - mul(dynamic_cast(a.get()), dynamic_cast(b.get()), - scaleAB, scaleT); + mul(dynamic_cast(a.get()), + dynamic_cast(b.get()), + scaleAB, + scaleT); } else if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), scaleAB, scaleT); + dynamic_cast(b.get()), + scaleAB, + scaleT); } else if (dynamic_cast(a.get()) && dynamic_cast(b.get())) { mul(dynamic_cast(a.get()), - dynamic_cast(b.get()), scaleAB, scaleT); + dynamic_cast(b.get()), + scaleAB, + scaleT); } else { LOG(FATAL) << "Not supported"; } } -void CpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void CpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { if (dynamic_cast(b)) { return mul(a, dynamic_cast(b), this, scaleAB, scaleT); @@ -2326,11 +2710,35 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { int ldb = b->getStride(); int ldc = getStride(); #ifndef PADDLE_TYPE_DOUBLE - cblas_sgemm(CblasRowMajor, a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, - scaleT, C, ldc); + cblas_sgemm(CblasRowMajor, + a_trans, + b_trans, + M, + N, + K, + scaleAB, + A, + lda, + B, + ldb, + scaleT, + C, + ldc); #else - cblas_dgemm(CblasRowMajor, a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, - scaleT, C, ldc); + cblas_dgemm(CblasRowMajor, + a_trans, + b_trans, + M, + N, + K, + scaleAB, + A, + lda, + B, + ldb, + scaleT, + C, + ldc); // TODO(yuyang18): Is gemm defined other place? 
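// For reference: cblas_{s,d}gemm computes C = alpha * op(A) * op(B) + beta * C
// on row-major data, so scaleAB and scaleT play the roles of alpha and beta,
// and lda/ldb/ldc are the leading row strides taken from getStride() above.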
#endif @@ -2338,8 +2746,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) { << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1]; } -void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, - real scaleAB, real scaleT) { +void CpuMatrix::mul( + CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, real scaleT) { CHECK(!c->isTransposed()) << "Not supported"; CHECK_EQ(c->getValueType(), FLOAT_VALUE); @@ -2446,7 +2854,9 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, } } -void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, +void CpuMatrix::mul(CpuMatrix* a, + CpuSparseMatrix* b, + real scaleAB, real scaleT) { CHECK(!trans_) << "Not supported"; CHECK(!a->isTransposed()) << "Not supported"; @@ -2484,8 +2894,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getColStartIdx(j); int end = b->getColStartIdx(j + 1); for (int i = start; i < end; ++i) { - colVecAddTo(C + j, A + rows[i], B[i], height_, width_, - a->getWidth()); + colVecAddTo( + C + j, A + rows[i], B[i], height_, width_, a->getWidth()); } } } @@ -2507,8 +2917,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getColStartIdx(i); int end = b->getColStartIdx(i + 1); for (int j = start; j < end; ++j) { - colVecAddTo(C + rows[j], A + i, B[j], height_, width_, - a->getWidth()); + colVecAddTo( + C + rows[j], A + i, B[j], height_, width_, a->getWidth()); } } } @@ -2533,8 +2943,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getRowStartIdx(j); int end = b->getRowStartIdx(j + 1); for (int i = start; i < end; ++i) { - colVecAddTo(C + cols[i], A + j, B[i], height_, width_, - a->getWidth()); + colVecAddTo( + C + cols[i], A + j, B[i], height_, width_, a->getWidth()); } } } @@ -2556,8 +2966,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, int start = b->getRowStartIdx(i); int end = b->getRowStartIdx(i + 1); for (int j = start; j < end; ++j) { - colVecAddTo(C + i, A + cols[j], B[j], height_, width_, - a->getWidth()); + colVecAddTo( + C + i, A + cols[j], B[j], height_, width_, a->getWidth()); } } } @@ -2656,8 +3066,8 @@ void CpuMatrix::addToRowsImp(TableMatType& table, IVector& ids) { static ThreadLocal> threadLocalColArray; template -void CpuMatrix::mul(CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, - real scaleT) { +void CpuMatrix::mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT) { CHECK(!c->isTransposed()) << "Not supported"; CHECK(!b->isTransposed()) << "Not supported"; // TODO(yuyang18): Maybe bug implementation here. 
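The sparse mul() overloads above all reduce to the same row scatter/gather pattern built on vecAddTo and colVecAddTo. A minimal self-contained sketch of the CSR-times-dense case, with illustrative names that are not Paddle's API:

#include <cstddef>

// C += scaleAB * A * B, where A is M x K in CSR form (rowPtr/cols/vals)
// and B, C are dense row-major matrices of width N.
void csrMulDense(const int* rowPtr, const int* cols, const float* vals,
                 size_t M, size_t N, const float* B, float* C, float scaleAB) {
  for (size_t i = 0; i < M; ++i) {
    // nonzeros of row i occupy the half-open range [rowPtr[i], rowPtr[i + 1])
    for (int k = rowPtr[i]; k < rowPtr[i + 1]; ++k) {
      const float* bRow = B + static_cast<size_t>(cols[k]) * N;
      const float a = vals[k] * scaleAB;
      for (size_t j = 0; j < N; ++j) {
        C[i * N + j] += a * bRow[j];  // the vecAddTo-style inner update
      }
    }
  }
}

The CSC and transposed branches in the code above differ only in which operand is walked with a column stride, which is exactly what colVecAddTo's aWidth/bWidth arguments encode.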
@@ -2760,18 +3170,26 @@ void CpuMatrix::mul(CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, // instantiation mul() called in SparseRowMatrix.cpp template void CpuMatrix::mul( - CpuSparseMatrix* a, CpuMatrix* b, SparseRowCpuMatrix* c, real scaleAB, + CpuSparseMatrix* a, + CpuMatrix* b, + SparseRowCpuMatrix* c, + real scaleAB, real scaleT); template void CpuMatrix::mul( - CpuSparseMatrix* a, CpuMatrix* b, SparseAutoGrowRowCpuMatrix* c, - real scaleAB, real scaleT); + CpuSparseMatrix* a, + CpuMatrix* b, + SparseAutoGrowRowCpuMatrix* c, + real scaleAB, + real scaleT); template void CpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, CacheRowCpuMatrix* c, real scaleAB, real scaleT); -void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void SharedCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { CHECK(!isTransposed()) << "Not supported"; CHECK(!b->isTransposed()) << "Not supported"; @@ -2811,8 +3229,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, for (int k = 0; k < blockNum_; ++k) { blockSeq.push_back(k); } - std::shuffle(blockSeq.begin(), blockSeq.end(), - ThreadLocalRandomEngine::get()); + std::shuffle( + blockSeq.begin(), blockSeq.end(), ThreadLocalRandomEngine::get()); } std::vector& localBufRows = *localBufRows_; int* cols = a->getCols(); @@ -2850,8 +3268,8 @@ void SharedCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, localBufRows.push_back(i); size_t bufPos = localBufRows.size() - 1; for (int j = start; j < end; ++j) { - vecAddTo(localC + bufPos * width, B + cols[j] * width, value[j], - width); + vecAddTo( + localC + bufPos * width, B + cols[j] * width, value[j], width); } } } @@ -2935,7 +3353,7 @@ void CpuMatrix::rowSum(Matrix& sum) { CHECK_EQ(sum.getHeight(), getHeight()); CHECK_EQ(sum.getWidth(), (size_t)1); - sum.sumRows(*this, /* scaleSum= */1, /* scaleDest= */0); + sum.sumRows(*this, /* scaleSum= */ 1, /* scaleDest= */ 0); } void CpuMatrix::rowMaxId(IVector& maxIds) { @@ -2987,7 +3405,9 @@ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { } std::partial_sort( - vec.begin(), vec.begin() + beam, vec.end(), + vec.begin(), + vec.begin() + beam, + vec.end(), [](const std::pair& l, const std::pair& r) { return l.first > r.first; }); @@ -3023,7 +3443,9 @@ void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { } std::partial_sort( - vec.begin(), vec.begin() + beam, vec.end(), + vec.begin(), + vec.begin() + beam, + vec.end(), [](const std::pair& l, const std::pair& r) { return l.first > r.first; }); @@ -3034,7 +3456,9 @@ void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) { } } -void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, +void CpuMatrix::maxoutForward(Matrix& a, + IVector& id, + size_t channels, size_t groups) { CHECK(dynamic_cast(&a)); CHECK(dynamic_cast(&id)); @@ -3067,7 +3491,9 @@ void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels, } } -void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels, +void CpuMatrix::maxoutBackward(Matrix& a, + IVector& id, + size_t channels, size_t groups) { CHECK(dynamic_cast(&a)); CHECK(dynamic_cast(&id)); @@ -3189,7 +3615,8 @@ void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) { but we define the scalar function here for sanity check deletion of the function does not affect anything neverthelss */ -void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, +void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, 
real alpha) { CHECK(dynamic_cast(&output)); CHECK(dynamic_cast(&label)); @@ -3220,7 +3647,8 @@ void CpuMatrix::oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, but we define the scalar function here for sanity check deletion of the function does not affect anything neverthelss */ -void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, IVector& label, +void CpuMatrix::oneHotCrossEntropyWithSelfNormBp(Matrix& output, + IVector& label, real alpha) { CHECK(dynamic_cast(&output)); CHECK(dynamic_cast(&label)); @@ -3301,10 +3729,16 @@ void CpuMatrix::sequenceSoftmax(Matrix& output, const IVector& index) { CHECK_EQ(output.getWidth(), 1UL); CHECK(isContiguous()); - MatrixPtr inTmp = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, false); - MatrixPtr outTmp = Matrix::create(nullptr, /* height= */ 1, 1, - /* trans= */ false, false); + MatrixPtr inTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); + MatrixPtr outTmp = Matrix::create(nullptr, + /* height= */ 1, + 1, + /* trans= */ false, + false); size_t numSequences = index.getSize() - 1; auto starts = index.getData(); for (size_t i = 0; i < numSequences; ++i) { @@ -3360,9 +3794,12 @@ void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) { } } -void CpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, - Matrix& prevGrad2, real scale) { +void CpuMatrix::cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale) { CHECK(output.useGpu_ == false) << "Matrix type are not equal"; CHECK_EQ(getWidth(), 1UL); @@ -3392,8 +3829,11 @@ void CpuMatrix::cosSimDerivative(Matrix& output, Matrix& prevOut1, CHECK_EQ(prevOut2.getHeight(), numSamples); CHECK_EQ(prevGrad2.getHeight(), numSamples); } - for (size_t i = 0; i < numSamples; ++i, prevOutX += dim, prevOutY += yInc, - prevGradX += dim, prevGradY += yInc) { + for (size_t i = 0; i < numSamples; ++i, + prevOutX += dim, + prevOutY += yInc, + prevGradX += dim, + prevGradY += yInc) { real squareSumX = 0; real squareSumY = 0; real xy = 0; @@ -3450,7 +3890,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { int* cols = labelptr->getCols(); for (size_t i = 0; i < numSamples; ++i) { for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { cost[i] += 1.0 - 2.0 * out[i * dim + cols[j]]; /* * explanation of above line: original codes are follows: @@ -3466,7 +3907,8 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { real sum1 = 0; real sum2 = 0; for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { sum1 += values[j] * values[j]; sum2 += values[j] * out[i * dim + cols[j]]; /* @@ -3488,8 +3930,10 @@ void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) { } } - BaseMatrix::sumOfSquaredDiffs(output, label, - /* scaleSum= */1, /* scaleDest= */1); + BaseMatrix::sumOfSquaredDiffs(output, + label, + /* scaleSum= */ 1, + /* scaleDest= */ 1); } /* calculate the error of outputV according to label */ @@ -3519,7 +3963,8 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { int* cols = labelptr->getCols(); for (size_t i = 0; i < numSamples; ++i) { for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { grad[i * dim + cols[j]] -= 2.0; /* * 
explanation of above line: original codes are follows: @@ -3534,7 +3979,8 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { real* values = labelptr->getValue(); for (size_t i = 0; i < numSamples; ++i) { for (size_t j = labelptr->getRowStartIdx(i); - j < labelptr->getRowStartIdx(i + 1); ++j) { + j < labelptr->getRowStartIdx(i + 1); + ++j) { grad[i * dim + cols[j]] -= 2.0 * values[j]; /* * explanation of above line: original codes are follows: @@ -3809,8 +4255,8 @@ void CpuMatrix::circularConv(Matrix& in0, Matrix& in1) { } } -void CpuMatrix::circularConvDerivative(Matrix& outG, Matrix& in0, Matrix& in1, - Matrix& inG0, Matrix& inG1) { +void CpuMatrix::circularConvDerivative( + Matrix& outG, Matrix& in0, Matrix& in1, Matrix& inG0, Matrix& inG1) { size_t height = in0.getHeight(); size_t width0 = in0.getWidth(); size_t width1 = in1.getWidth(); @@ -3830,8 +4276,12 @@ void CpuMatrix::circularConvDerivative(Matrix& outG, Matrix& in0, Matrix& in1, real* inGV1 = inG1.getData(); int leftCtxLen = (width1 - 1) / 2; - for (size_t x = 0; x < height; ++x, outGV += width0, inV0 += width0, - inV1 += width1, inGV0 += width0, inGV1 += width1) { + for (size_t x = 0; x < height; ++x, + outGV += width0, + inV0 += width0, + inV1 += width1, + inGV0 += width0, + inGV1 += width1) { for (size_t j = 0; j < width1; ++j) { // iterate over width1 for (size_t i = 0; i < width0; ++i) { // such over all dimensions of outG @@ -3900,7 +4350,8 @@ void CpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { } /* calculate the classification error for multi binary label */ -void CpuMatrix::classificationErrorMulti(Matrix& output, Matrix& label, +void CpuMatrix::classificationErrorMulti(Matrix& output, + Matrix& label, real threshold) { CHECK(dynamic_cast(&output)); auto labelPtr = dynamic_cast(&label); @@ -3954,12 +4405,12 @@ void CpuMatrix::bilinearForward(const Matrix& in, (void)(inputH); real* outData = getData(); - const real* inData = in.getData(); + const real* inData = in.getData(); if (inImgH == outImgH && inImgW == outImgW) { this->copyFrom(in); } else { - for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t k = 0; k < batchSize; ++k) { // loop for batches for (size_t i = 0; i < outImgH; ++i) { // loop for images size_t h = ratioH * i; size_t hid = (h < inImgH - 1) ? 1 : 0; @@ -3977,9 +4428,9 @@ void CpuMatrix::bilinearForward(const Matrix& in, for (size_t c = 0; c < numChannels; ++c) { // loop for channels // bilinear interpolation outPos[0] = - h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + - h1lambda * (w2lambda * inPos[hid * inImgW] + - w1lambda * inPos[hid * inImgW + wid]); + h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wid]) + + h1lambda * (w2lambda * inPos[hid * inImgW] + + w1lambda * inPos[hid * inImgW + wid]); inPos += inPosOffset; outPos += outPosOffset; } @@ -4013,7 +4464,7 @@ void CpuMatrix::bilinearBackward(const Matrix& out, if (inImgH == outImgH && inImgW == outImgW) { this->add(const_cast(out)); } else { - for (size_t k = 0; k < batchSize; ++k) { // loop for batches + for (size_t k = 0; k < batchSize; ++k) { // loop for batches for (size_t i = 0; i < outImgH; ++i) { // loop for images size_t h = ratioH * i; size_t hid = (h < inImgH - 1) ? 
1 : 0; diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 6c3c4804d2fc67a3378c61e8b8ff1e3c0087dd83..075dc845768d7dfa156d33d057a30b28628c099c 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -77,12 +76,19 @@ typedef std::shared_ptr<CpuSparseMatrix> CpuSparseMatrixPtr; */ class Matrix : public BaseMatrix { protected: - Matrix(MemoryHandlePtr memHandle, size_t height, size_t width, bool trans, + Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, bool use_gpu); Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); - Matrix(real* data, size_t height, size_t width, size_t stride, bool trans, + Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, bool use_gpu); static ThreadLocal tmpMat_; @@ -94,38 +100,66 @@ public: public: virtual ~Matrix() {} - static MatrixPtr create(MemoryHandlePtr memHandle, size_t height, - size_t width, bool trans = false); - static MatrixPtr create(size_t height, size_t width, bool trans = false, + static MatrixPtr create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans = false); + static MatrixPtr create(size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + bool trans = false, bool useGpu = false); - static MatrixPtr create(real* data, size_t height, size_t width, - bool trans = false, bool useGpu = false); - static MatrixPtr create(real* data, size_t height, size_t width, - size_t stride, bool trans = false, + static MatrixPtr create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false, bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, size_t width, size_t nnz, + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, SparseValueType valueType = FLOAT_VALUE, - bool trans = false, bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, size_t width, size_t nnz, + bool trans = false, + bool useGpu = false); + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, SparseValueType valueType = FLOAT_VALUE, SparseFormat format = SPARSE_CSR, - bool trans = false, bool useGpu = false); - - static MatrixPtr createSparseMatrix(real* data, int* row, int* col, - size_t height, size_t width, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType, /*value type*/ - SparseFormat format, bool trans, + SparseFormat format, + bool trans, bool useGpu); static void resizeOrCreateSparseMatrix( - MatrixPtr& matrix, size_t height, size_t width, size_t nnz, - SparseValueType valueType = FLOAT_VALUE, SparseFormat foramt = SPARSE_CSR, - bool trans = false, bool useGpu = false); - - static void resizeOrCreate(MatrixPtr& a, size_t height, size_t width, - bool trans = false, bool useGpu = false); + MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat format = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static void resizeOrCreate(MatrixPtr& a, + size_t height, + size_t width, + bool trans = false, + bool
useGpu = false); /** * @brief set the data buffer used to hold the matrix data. @@ -163,12 +197,12 @@ public: // if refactor sparse matrix virtual int* getRows() const { LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. + return nullptr; //! suppress warning for no return value. } virtual int* getCols() const { LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. + return nullptr; //! suppress warning for no return value. } virtual SparseFormat getFormat() const { @@ -178,7 +212,7 @@ public: virtual SparseValueType getValueType() const { LOG(FATAL) << "Not implemented"; - return NO_VALUE; //! suppress warning for no return value. + return NO_VALUE; //! suppress warning for no return value. } /** @@ -208,7 +242,9 @@ public: LOG(FATAL) << "Not implemented"; } - MatrixPtr subMatrix(size_t startRow, size_t endRow, size_t startCol, + MatrixPtr subMatrix(size_t startRow, + size_t endRow, + size_t startCol, size_t endCol); MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { @@ -221,8 +257,11 @@ public: virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { CHECK_LE(startRow + numRows, getHeight()); - return Matrix::create(getData() + startRow * getWidth(), numRows, - getWidth(), trans_, useGpu_); + return Matrix::create(getData() + startRow * getWidth(), + numRows, + getWidth(), + trans_, + useGpu_); } virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { CHECK_LE(startRow + numRows, getHeight()); @@ -267,7 +306,8 @@ public: * as this, otherwise the new matrix will have the specified size. * */ - virtual MatrixPtr clone(size_t height = 0, size_t width = 0, + virtual MatrixPtr clone(size_t height = 0, + size_t width = 0, bool useGpu = false) { LOG(FATAL) << "Not implemented"; return nullptr; @@ -305,9 +345,11 @@ public: /** * @note This should only be used for sparse matrix. */ - virtual void resize(size_t newHeight, size_t newWidth, + virtual void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* total item used to allocate space */ - SparseValueType valueType, SparseFormat format) = 0; + SparseValueType valueType, + SparseFormat format) = 0; /** * @brief This should only be used for sparse matrix. @@ -315,7 +357,9 @@ public: * Currently must be called for each row in order. * The matrix is not valid until setRow is called for the last row. 
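 *
 * A hypothetical CSR fill, for illustration only (values are made up):
 * @code
 *   unsigned int cols0[] = {0, 2};  real vals0[] = {1.0, 2.0};
 *   unsigned int cols1[] = {1};     real vals1[] = {3.0};
 *   mat->setRow(0, 2, cols0, vals0);  // row 0: two nonzeros
 *   mat->setRow(1, 1, cols1, vals1);  // matrix becomes valid after the last row
 * @endcode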
*/ - virtual void setRow(size_t row, size_t colNum, const unsigned int* cols, + virtual void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values) = 0; virtual MatrixPtr getTranspose() = 0; @@ -389,8 +433,9 @@ public: } } - virtual void sequenceAvgForward(Matrix& a, const IVector& startsPos, - int mode) { + virtual void sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { LOG(FATAL) << "Not implemented"; } @@ -399,7 +444,9 @@ public: * this = scaleAB*(a*b) + scaleT*this * @endcode */ - virtual void mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, + virtual void mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { LOG(FATAL) << "Not implemented"; } @@ -416,7 +463,8 @@ public: * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 * @endcode */ - virtual void addByBitCode(size_t numClasses, const IVector& codes, + virtual void addByBitCode(size_t numClasses, + const IVector& codes, const Matrix& vec) { (void)numClasses; (void)codes; @@ -431,7 +479,8 @@ public: * where index is same as the index for addByBitCode * @endcode */ - virtual void addByBitCodeBackward(size_t numClasses, const IVector& codes, + virtual void addByBitCodeBackward(size_t numClasses, + const IVector& codes, Matrix& vec) { (void)numClasses; (void)codes; @@ -446,8 +495,10 @@ public: * where index is same as the index for addByBitCode * @endcode */ - virtual void mulByBitCode(size_t numClasses, const IVector& codes, - const Matrix& mat, const Matrix& input) { + virtual void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input) { (void)numClasses; (void)codes; (void)mat; @@ -463,7 +514,8 @@ public: * @endcode */ virtual void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, Matrix& mat, + const IVector& codes, + Matrix& mat, const Matrix& input) { (void)numClasses; (void)codes; @@ -481,7 +533,8 @@ public: */ virtual void mulByBitCodeBackwardError(size_t numClasses, const IVector& codes, - const Matrix& mat, Matrix& input) { + const Matrix& mat, + Matrix& input) { (void)numClasses; (void)codes; (void)mat; @@ -496,7 +549,9 @@ public: * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 * @endcode */ - virtual void sumByBitCode(size_t numClasses, IVector& codes, Matrix& sum, + virtual void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, real scaleSum) { (void)numClasses; (void)codes; @@ -550,12 +605,16 @@ public: LOG(FATAL) << "not implemented"; } - virtual void maxoutForward(Matrix& a, IVector& id, size_t channels, + virtual void maxoutForward(Matrix& a, + IVector& id, + size_t channels, size_t groups) { LOG(FATAL) << "not implemented"; } - virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels, + virtual void maxoutBackward(Matrix& a, + IVector& id, + size_t channels, size_t groups) { LOG(FATAL) << "not implemented"; } @@ -634,7 +693,8 @@ public: } /// copy -log(output[label]) to this->data[i]. 
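The /// contract above fits in a few lines; a standalone sketch (illustrative only, not Paddle's implementation):

#include <cmath>
#include <cstddef>

// cost[i] = -log(output[i][label[i]]) for an N x C row-major probability
// matrix and N integer labels.
void oneHotCrossEntropySketch(const float* output, const int* label,
                              size_t N, size_t C, float* cost) {
  for (size_t i = 0; i < N; ++i) {
    cost[i] = -std::log(output[i * C + static_cast<size_t>(label[i])]);
  }
}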
- virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, + virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha) { LOG(FATAL) << "Not implemented"; } @@ -660,13 +720,14 @@ public: LOG(FATAL) << "Not implemented"; } - virtual void circularConvDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, + virtual void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, Matrix& prevGrad2) { LOG(FATAL) << "Not implemented"; } - /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ virtual void softmax(Matrix& output) { (void)output; @@ -727,9 +788,12 @@ public: LOG(FATAL) << "Not implemented"; } - virtual void cosSimDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, - Matrix& prevGrad2, real scale = 1.0f) { + virtual void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale = 1.0f) { LOG(FATAL) << "Not implemented"; } @@ -781,10 +845,18 @@ public: * It will expand a feature matrix according to the * convolution filters */ - virtual void convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW) { + virtual void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW) { LOG(FATAL) << "Not implemeted"; } @@ -793,11 +865,20 @@ public: * * Its function is to restore a expanded-matrix into a feature matrix */ - virtual void convShrink(Matrix& expandColMat, int thisImgHeight, - int thisImgWidth, int channels, int blockH, - int blockW, int strideH, int strideW, int paddingH, - int paddingW, int outputH, int outputW, - real alpha = 1.0f, real beta = 0.0f) { + virtual void convShrink(Matrix& expandColMat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f) { LOG(FATAL) << "Not implemeted"; } @@ -805,54 +886,93 @@ public: * Pooling forward operation, pick out the largest element * in the sizeX of value */ - virtual void maxPoolForward(Matrix& inputMat, size_t imgSizeH, - size_t imgSizeW, size_t channels, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + virtual void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } /// Pooling backward operation. 
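A note on the conv/pool geometry above: outputH/outputW arrive precomputed from the layer configuration. Under one common convention (assumed here; the exact rounding Paddle uses is configured elsewhere):

#include <cstddef>

// Output extent along one axis for a sliding window of size windowSize.
inline size_t outputSize(size_t imgSize, size_t windowSize, size_t padding,
                         size_t stride) {
  return (imgSize + 2 * padding - windowSize) / stride + 1;
}
// e.g. outputSize(32, 5, 2, 1) == 32; convExpand would then emit an im2col
// matrix with channels * blockH * blockW rows and outputH * outputW columns
// (the standard im2col layout, assumed here).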
- virtual void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + virtual void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } /// Pooling forward operation, caculate the average of sizeX elements. - virtual void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW) { + virtual void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } - virtual void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW) { + virtual void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { LOG(FATAL) << "Not implemeted"; } /// normalize-operation. - virtual void crossMapNormalFwd(Matrix& input, size_t imgSizeH, - size_t imgSizeW, Matrix& denoms, - size_t channels, size_t sizeX, float scale, + virtual void crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, float pow) { LOG(FATAL) << "Not implemeted"; } - virtual void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, - Matrix& preOutV, Matrix& localOutV, - size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t size, float scale, + virtual void crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t size, + float scale, float pow) { LOG(FATAL) << "Not implemeted"; } @@ -865,20 +985,24 @@ public: * * output[i] is set to max_input[i]. 
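 *
 * Illustration with made-up values: sequence = {0, 3, 5} describes two
 * sequences occupying input rows [0, 3) and [3, 5); output then has two
 * rows, and for each column j, index records which input row supplied the
 * maximum so maxSequenceBackward can route the gradient back to it.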
*/ - virtual void maxSequenceForward(Matrix& input, const IVector& sequence, + virtual void maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index) { LOG(FATAL) << "Not implemeted"; } - virtual void maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, + virtual void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index) { LOG(FATAL) << "Not implemeted"; } - virtual void contextProjectionForward(MatrixPtr input, MatrixPtr weight, + virtual void contextProjectionForward(MatrixPtr input, + MatrixPtr weight, const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + int contextStart, + size_t beginPad, bool isPadding) { LOG(FATAL) << "Not implemeted"; } @@ -887,7 +1011,8 @@ public: MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + int contextStart, + size_t beginPad, bool isPadding) { LOG(FATAL) << "Not implemeted"; } @@ -902,7 +1027,8 @@ public: virtual void contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, int totalPad, + int contextStart, + int totalPad, size_t beginPad) { LOG(FATAL) << "Not implemeted"; } @@ -981,7 +1107,8 @@ public: * / output->getWidth() * @endcode */ - virtual void classificationErrorMulti(Matrix& output, Matrix& label, + virtual void classificationErrorMulti(Matrix& output, + Matrix& label, real threshold) { LOG(FATAL) << "Not implemented"; } @@ -1029,10 +1156,15 @@ public: GpuMatrix(size_t height, size_t width, bool trans = false); GpuMatrix(real* data, size_t height, size_t width, bool trans = false) : Matrix(data, height, width, trans, true) {} - GpuMatrix(real* data, size_t height, size_t width, size_t stride, + GpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, bool trans = false) : Matrix(data, height, width, stride, trans, true) {} - GpuMatrix(GpuMemHandlePtr dataHandle, size_t height, size_t width, + GpuMatrix(GpuMemHandlePtr dataHandle, + size_t height, + size_t width, bool trans = false) : Matrix(dataHandle, height, width, trans, true) {} ~GpuMatrix(); @@ -1042,12 +1174,16 @@ public: void setDiag(real value); void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format) { + SparseValueType valueType, + SparseFormat format) { LOG(FATAL) << "Only Support Sparse Matrix"; } - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values) { LOG(FATAL) << "Only Support Sparse Matrix"; } @@ -1137,10 +1273,14 @@ public: void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); - void mul(const GpuSparseMatrix& a, const GpuMatrix& b, real scaleAB, + void mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, real scaleT); - void mul(const GpuMatrix& a, const GpuSparseMatrix& b, real scaleAB, + void mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, real scaleT); /** @@ -1182,9 +1322,11 @@ public: void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, IVector& 
label, + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, real alpha); void softmax(Matrix& output); @@ -1204,8 +1346,12 @@ public: void scaledTanh(Matrix& output, real p1, real p2); void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, Matrix& prevOut1, Matrix& prevOut2, - Matrix& prevGrad1, Matrix& prevGrad2, real scale); + void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale); virtual void print(std::ostream& os) const; virtual void print(std::ostream& os, size_t height, size_t width) const; @@ -1219,71 +1365,136 @@ public: void classificationError(MatrixPtr output, IVectorPtr label); - void convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blockH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW); - - void convShrink(Matrix& expandColMat, int thisImgHeight, int thisImgWidth, - int channels, int blockH, int blochW, int strideH, - int strideW, int paddingH, int paddingWreal, - int outputH, int outputW, - real alpha = 1.0f, real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, size_t sizeX, - size_t sizeY, size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, - Matrix& denoms, size_t channels, size_t sizeX, - float scale, float pow); - - void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, - Matrix& localOutV, size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, - float scale, float pow); - - void maxSequenceForward(Matrix& input, const IVector& sequence, + void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW); + + void convShrink(Matrix& expandColMat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blochW, + int strideH, + int strideW, + int paddingH, + int paddingWreal, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real 
scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, + float pow); + + void crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + float scale, + float pow); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index); - void maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index); - void contextProjectionForward(MatrixPtr input, MatrixPtr weight, - const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + void contextProjectionForward(MatrixPtr input, + MatrixPtr weight, + const IVector& sequence, + int contextLength, + int contextStart, + size_t beginPad, bool isPadding); void contextProjectionBackwardData(MatrixPtr inputGrad, const IVector& sequence, - int contextLength, int contextStart); + int contextLength, + int contextStart); void contextProjectionBackwardWeight(MatrixPtr weightGrad, const IVector& sequence, int contextLength, - int contextStart, int totalPad, + int contextStart, + int totalPad, size_t beginPad); void bilinearForward(const Matrix& in, @@ -1314,11 +1525,16 @@ public: CpuMatrix(size_t height, size_t width, bool trans = false); CpuMatrix(real* data, size_t height, size_t width, bool trans = false) : Matrix(data, height, width, trans, false) {} - CpuMatrix(real* data, size_t height, size_t width, size_t stride, + CpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, bool trans = false) : Matrix(data, height, width, stride, trans, false) {} - CpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, + CpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, bool trans = false) : Matrix(dataHandle, height, width, trans, false) {} @@ -1329,12 +1545,16 @@ public: void setDiag(real value); void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format) { + SparseValueType valueType, + SparseFormat format) { LOG(FATAL) << "Only Support Sparse Matrix"; } - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values) { LOG(FATAL) << "Only Support Sparse Matrix"; } @@ -1366,67 +1586,132 @@ public: MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - void convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth, - int channels, int blcokH, int blockW, int strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW); - - void convShrink(Matrix& expandFeat, int thisImgHeight, int thisImgWidth, - int channels, int blockH, int blockW, int 
strideH, - int strideW, int paddingH, int paddingW, - int outputH, int outputW, - real alpha = 1.0f, real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void maxPoolBackward(Matrix& image, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, Matrix& outV, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void avgPoolForward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t channels, size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - size_t paddingH, size_t paddingW); - - void avgPoolBackward(Matrix& input, size_t imgSizeH, size_t imgSizeW, - size_t sizeX, size_t sizeY, - size_t strideH, size_t strideW, - size_t outputH, size_t outputW, - real scaleTargets, real scaleOutput, - size_t paddingH, size_t paddingW); - - void crossMapNormalFwd(Matrix& input, size_t imgSizeH, size_t imgSizeW, - Matrix& denoms, size_t channels, size_t sizeX, - float scale, float pow); - - void crossMapNormalBwd(Matrix& localGrad, Matrix& denoms, Matrix& preOutV, - Matrix& localOutV, size_t channels, size_t imgSizeH, - size_t imgSizeW, size_t sizeX, - float scale, float pow); - - void maxSequenceForward(Matrix& input, const IVector& sequence, + void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blcokH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW); + + void convShrink(Matrix& expandFeat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void crossMapNormalFwd(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + Matrix& denoms, + size_t channels, + size_t sizeX, + float scale, + float pow); + + void crossMapNormalBwd(Matrix& localGrad, + Matrix& denoms, + Matrix& preOutV, + Matrix& localOutV, + size_t channels, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + float scale, + float pow); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, IVector& index); - void maxSequenceBackward(Matrix& outputGrad, const IVector& sequence, + 
void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, IVector& index); - void contextProjectionForward(MatrixPtr input, MatrixPtr weight, - const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + void contextProjectionForward(MatrixPtr input, + MatrixPtr weight, + const IVector& sequence, + int contextLength, + int contextStart, + size_t beginPad, bool isPadding); - void contextProjectionBackward(MatrixPtr inputGrad, MatrixPtr weightGrad, - const IVector& sequence, int contextLength, - int contextStart, size_t beginPad, + void contextProjectionBackward(MatrixPtr inputGrad, + MatrixPtr weightGrad, + const IVector& sequence, + int contextLength, + int contextStart, + size_t beginPad, bool isPadding); real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } @@ -1443,7 +1728,6 @@ public: void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - /** * @code * this.row[i] += table.row[ids[i]] @@ -1490,7 +1774,10 @@ public: void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); - static void mul(CpuMatrix* a, CpuMatrix* b, CpuSparseMatrix* c, real scaleAB, + static void mul(CpuMatrix* a, + CpuMatrix* b, + CpuSparseMatrix* c, + real scaleAB, real scaleT); /** @@ -1500,8 +1787,8 @@ public: * Define B,C as template instead of virtual class for performance sake. */ template - static void mul(CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, - real scaleT); + static void mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); @@ -1525,14 +1812,18 @@ public: void oneHotCrossEntropy(Matrix& output, IVector& label); void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, IVector& label, + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, IVector& label, + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, real alpha); void circularConv(Matrix& b, Matrix& c); - void circularConvDerivative(Matrix& output, Matrix& prevOut1, - Matrix& prevOut2, Matrix& prevGrad1, + void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, Matrix& prevGrad2); void softmax(Matrix& output); @@ -1553,8 +1844,12 @@ public: void scaledTanh(Matrix& output, real p1, real p2); void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, Matrix& prevOut1, Matrix& prevOut2, - Matrix& prevGrad1, Matrix& prevGrad2, real scale); + void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale); void print(std::ostream& os) const; void print(std::ostream& os, size_t height, size_t width) const; @@ -1575,19 +1870,28 @@ public: void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); - void addByBitCodeBackward(size_t numClasses, const IVector& codes, + void addByBitCodeBackward(size_t numClasses, + const IVector& codes, Matrix& vec); - void mulByBitCode(size_t numClasses, const IVector& codes, const Matrix& mat, + void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, const Matrix& input); - void mulByBitCodeBackwardWeight(size_t numClasses, const IVector& codes, - Matrix& mat, const Matrix& input); + void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + 
Matrix& mat, + const Matrix& input); - void mulByBitCodeBackwardError(size_t numClasses, const IVector& codes, - const Matrix& mat, Matrix& input); + void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input); - void sumByBitCode(size_t numClasses, IVector& codes, Matrix& sum, + void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, real scaleSum); void subByBitCode(size_t numClasses_, IVector& codes); @@ -1622,20 +1926,25 @@ public: : CpuMatrix(height, width, trans) { initShared(blockNum); } - SharedCpuMatrix(int blockNum, real* data, size_t height, size_t width, - bool trans = false) + SharedCpuMatrix( + int blockNum, real* data, size_t height, size_t width, bool trans = false) : CpuMatrix(data, height, width, trans) { initShared(blockNum); } - SharedCpuMatrix(int blockNum, CpuMemHandlePtr dataHandle, size_t height, - size_t width, bool trans = false) + SharedCpuMatrix(int blockNum, + CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) : CpuMatrix(dataHandle, height, width, trans) { initShared(blockNum); } - SharedCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, - size_t width, bool trans = false) + SharedCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) : CpuMatrix(dataHandle, height, width, trans) { initBlock(1); } diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp index 8497c26e35404a4de970bc2d28b23ebf1090ae6c..ac5b10c7bd56bb34393ac8abb98900351afc2e41 100644 --- a/paddle/math/MatrixBitCode.cpp +++ b/paddle/math/MatrixBitCode.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Logging.h" #include "paddle/utils/Util.h" #include "Matrix.h" @@ -80,8 +79,8 @@ private: op(tmat(i, j), vec(0, index(i, j))) */ template <class Op, class CodeTable, class TMat, class Mat> -static void addByBitCodeT(Op op, CodeTable codeTable, const IVector& codes, - TMat& tmat, Mat& vec) { +static void addByBitCodeT( + Op op, CodeTable codeTable, const IVector& codes, TMat& tmat, Mat& vec) { CHECK(!vec.useGpu()); size_t numClasses = codeTable.size(); @@ -109,7 +108,8 @@ static void addByBitCodeT(Op op, CodeTable codeTable, const IVector& codes, /* For j < codeLength: this(i, j) += vec(0, index(i, j)) */ -void CpuMatrix::addByBitCode(size_t numClasses, const IVector& codes, +void CpuMatrix::addByBitCode(size_t numClasses, + const IVector& codes, const Matrix& vec) { auto op = [](real& t, real v) { t += v; }; addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); @@ -118,7 +118,8 @@ void CpuMatrix::addByBitCode(size_t numClasses, const IVector& codes, /* For j < codeLength: vec(0, index(i, j)) += this(i, j) */ -void CpuMatrix::addByBitCodeBackward(size_t numClasses, const IVector& codes, +void CpuMatrix::addByBitCodeBackward(size_t numClasses, + const IVector& codes, Matrix& vec) { auto op = [](real t, real& v) { v += t; }; addByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, vec); @@ -129,10 +130,18 @@ void CpuMatrix::addByBitCodeBackward(size_t numClasses, const IVector& codes, for j < codeLength: op(tmat(i, j), mat.row(index(i, j)), input.row(i)) */ -template <class Op, class CodeTable, class IVec, class TMat, class WMat, class InMat> -void mulByBitCodeT(Op op, CodeTable codeTable, IVec& codes, TMat& tmat, - WMat& weight, InMat& input) { +template <class Op, + class CodeTable, + class IVec, + class TMat, + class WMat, + class InMat> +void mulByBitCodeT(Op op, + CodeTable codeTable, + IVec& codes, + TMat& tmat, + WMat& weight, + InMat& input) { CHECK(!tmat.useGpu() && !weight.useGpu() && !input.useGpu()); size_t numClasses = codeTable.size(); @@ -161,10 +170,12 @@ void mulByBitCodeT(Op op, CodeTable codeTable, IVec& codes, TMat& tmat, /* For j < codeLength: this(i, j) += */ -void CpuMatrix::mulByBitCode(size_t numClasses, const IVector& codes, - const Matrix& weight, const Matrix& input) { - auto op = [](real& t, const real* weightRow, const real* inputRow, - size_t inputDim) { +void CpuMatrix::mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& weight, + const Matrix& input) { + auto op = []( + real& t, const real* weightRow, const real* inputRow, size_t inputDim) { real sum = 0; for (size_t k = 0; k < inputDim; ++k) { sum += weightRow[k] * inputRow[k]; @@ -179,14 +190,15 @@ void CpuMatrix::mulByBitCode(size_t numClasses, const IVector& codes, weight.row(index(i, j)) += this(i, j) * input.row(i) */ void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, Matrix& weight, + const IVector& codes, + Matrix& weight, const Matrix& input) { - auto op = - [](const real t, real* weightRow, const real* inputRow, size_t inputDim) { - for (size_t k = 0; k < inputDim; ++k) { - weightRow[k] += t * inputRow[k]; - } - }; + auto op = []( + const real t, real* weightRow, const real* inputRow, size_t inputDim) { + for (size_t k = 0; k < inputDim; ++k) { + weightRow[k] += t * inputRow[k]; + } + }; mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); } @@ -196,20 +208,24 @@ void CpuMatrix::mulByBitCodeBackwardWeight(size_t numClasses, */ void CpuMatrix::mulByBitCodeBackwardError(size_t numClasses, const IVector& codes, - const Matrix& weight, Matrix& input) { - auto op = - [](const real t, const real* weightRow, real* inputRow, size_t inputDim) { - for (size_t k = 0; k < inputDim; ++k) { - inputRow[k] += t *
weightRow[k]; - } - }; + const Matrix& weight, + Matrix& input) { + auto op = []( + const real t, const real* weightRow, real* inputRow, size_t inputDim) { + for (size_t k = 0; k < inputDim; ++k) { + inputRow[k] += t * weightRow[k]; + } + }; mulByBitCodeT(op, SimpleCodeTable(numClasses), codes, *this, weight, input); } template -void sumByBitCodeT(CodeTable codeTable, IVector& codes, const CpuMatrix& tmat, - Matrix& sum, real scaleSum) { +void sumByBitCodeT(CodeTable codeTable, + IVector& codes, + const CpuMatrix& tmat, + Matrix& sum, + real scaleSum) { size_t maxCodeLength = codeTable.getMaxCodeLength(); size_t numSamples = tmat.getHeight(); size_t oWidth = tmat.getWidth(); @@ -237,7 +253,9 @@ void sumByBitCodeT(CodeTable codeTable, IVector& codes, const CpuMatrix& tmat, /* For j < codeLength: sum(i, 0) = \sum_j bit(i, j) * this(i, j) */ -void CpuMatrix::sumByBitCode(size_t numClasses, IVector& codes, Matrix& sum, +void CpuMatrix::sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, real scaleSum) { sumByBitCodeT(SimpleCodeTable(numClasses), codes, *this, sum, scaleSum); } diff --git a/paddle/math/MemoryHandle.cpp b/paddle/math/MemoryHandle.cpp index 11f746df5c2fb32175ebace1fd7dac3a2934cf9d..9101957fc6c221bed4aa8e0c76b4c6735e50fd2d 100644 --- a/paddle/math/MemoryHandle.cpp +++ b/paddle/math/MemoryHandle.cpp @@ -21,8 +21,7 @@ namespace paddle { /** * Calculate the actual allocation size according to the required size. */ -MemoryHandle::MemoryHandle(size_t size) - : size_(size), buf_(nullptr) { +MemoryHandle::MemoryHandle(size_t size) : size_(size), buf_(nullptr) { if (size_ <= 256) { // Memory allocation in cuda is always aligned to at least 256 bytes. // In many cases it is 512 bytes. @@ -44,9 +43,7 @@ GpuMemoryHandle::GpuMemoryHandle(size_t size) : MemoryHandle(size) { buf_ = allocator_->alloc(allocSize_); } -GpuMemoryHandle::~GpuMemoryHandle() { - allocator_->free(buf_, allocSize_); -} +GpuMemoryHandle::~GpuMemoryHandle() { allocator_->free(buf_, allocSize_); } CpuMemoryHandle::CpuMemoryHandle(size_t size) : MemoryHandle(size) { CHECK(size != 0) << " allocate 0 bytes"; @@ -54,8 +51,6 @@ CpuMemoryHandle::CpuMemoryHandle(size_t size) : MemoryHandle(size) { buf_ = allocator_->alloc(allocSize_); } -CpuMemoryHandle::~CpuMemoryHandle() { - allocator_->free(buf_, allocSize_); -} +CpuMemoryHandle::~CpuMemoryHandle() { allocator_->free(buf_, allocSize_); } } // namespace paddle diff --git a/paddle/math/MemoryHandle.h b/paddle/math/MemoryHandle.h index 809fba2d0a8963ba60f5abaa2d2daf415c2d985d..f12635d5d4b6ff7204d4d3e8d6f07d438c0ce1e8 100644 --- a/paddle/math/MemoryHandle.h +++ b/paddle/math/MemoryHandle.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -32,9 +31,9 @@ public: protected: PoolAllocator* allocator_; - size_t size_; // the requested size - size_t allocSize_; // the allocated size - int deviceId_; // the device id of memory if gpu memory + size_t size_; // the requested size + size_t allocSize_; // the allocated size + int deviceId_; // the device id of memory if gpu memory void* buf_; }; diff --git a/paddle/math/PoolAllocator.cpp b/paddle/math/PoolAllocator.cpp index 3a03496eb190ba6792708d9bcffd77cd0e45d4fc..2c150949dd4eca08824401685beecc19142cbd76 100644 --- a/paddle/math/PoolAllocator.cpp +++ b/paddle/math/PoolAllocator.cpp @@ -12,21 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PoolAllocator.h" namespace paddle { PoolAllocator::PoolAllocator(Allocator* allocator, - size_t sizeLimit, const std::string& name) + size_t sizeLimit, + const std::string& name) : allocator_(allocator), sizeLimit_(sizeLimit), poolMemorySize_(0), name_(name) {} -PoolAllocator::~PoolAllocator() { - freeAll(); -} +PoolAllocator::~PoolAllocator() { freeAll(); } void* PoolAllocator::alloc(size_t size) { if (sizeLimit_ > 0) { diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h index aca8ffb0ab42e10d76dc9fbaad657a8afab316e9..5d33b453127a5aaa355ba8c569baf1eefe931c96 100644 --- a/paddle/math/PoolAllocator.h +++ b/paddle/math/PoolAllocator.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp index 6147bed3d81112d57f03d23bbb6f5c2f327d4dc1..1fb156f29bbb586b6251f961bb4fd5f4d5da0737 100644 --- a/paddle/math/SIMDFunctions.cpp +++ b/paddle/math/SIMDFunctions.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - - #include "SIMDFunctions.h" #include #include @@ -85,7 +83,9 @@ static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) { return; } -static void col_max_sse(float* result, const float* data, int dim, +static void col_max_sse(float* result, + const float* data, + int dim, int numSamples) { // first sample, direct copy for (int d = 0; d < dim; ++d) { @@ -195,7 +195,9 @@ static void batch_addto_avx(float* a, const float* b[], int batch, size_t len) { return; } -static void col_max_avx(float* result, const float* data, int dim, +static void col_max_avx(float* result, + const float* data, + int dim, int numSamples) { // first sample, direct copy for (int d = 0; d < dim; ++d) { @@ -289,8 +291,8 @@ static void decayL1_avx(float* dst, float* src, float lambda, size_t sz) { } } -static void decayL1_avx(float* dst, float* src, float* lr, float lambda, - size_t sz) { +static void decayL1_avx( + float* dst, float* src, float* lr, float lambda, size_t sz) { int64_t i; int64_t size = sz; float src_val; @@ -379,8 +381,8 @@ void colMaxImpl(float* result, const float* data, int dim, int numSamples) { void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) { decayL1_avx(dst, src, lambda, len); } -void decayL1AvxImpl(float* dst, float* src, float* lr, float lambda, - size_t len) { +void decayL1AvxImpl( + float* dst, float* src, float* lr, float lambda, size_t len) { decayL1_avx(dst, src, lr, lambda, len); } diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 2b984d5f96a620a95752231749a8b8b74f47d010..ac82f109104d7c21f346f909984306de105c0fd4 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - - #pragma once #include #include @@ -123,8 +121,8 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len); void colMaxImpl(float* result, const float* data, int dim, int numSamples); #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len); -void decayL1AvxImpl(float* dst, float* src, float* lr, float lambda, - size_t len); +void decayL1AvxImpl( + float* dst, float* src, float* lr, float lambda, size_t len); #endif } // namespace internal @@ -153,8 +151,8 @@ inline void decayL1(float* dst, float* src, float lambda, size_t len) { } template <> -inline void decayL1(float* dst, float* src, float* lr, float lambda, - size_t len) { +inline void decayL1( + float* dst, float* src, float* lr, float lambda, size_t len) { #ifdef __AVX__ internal::decayL1AvxImpl(dst, src, lr, lambda, len); #else diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp index 67ac0488623075729996aa603bd0e89c7ce98d9f..2b0bff9535d5a9ba4a47def4d6f964c799325535 100644 --- a/paddle/math/SparseMatrix.cpp +++ b/paddle/math/SparseMatrix.cpp @@ -22,18 +22,25 @@ limitations under the License. 
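The decayL1 overloads reformatted above implement L1 weight decay, which element-wise is the soft-thresholding operator: each weight is shrunk toward zero by lambda and clamped at zero once it crosses the origin. Below is a scalar reference version of the rule the SSE/AVX paths are assumed to vectorize; decayL1Naive is an illustrative name, and the overload taking a lr array is assumed to scale lambda by a per-element learning rate.

#include <cstddef>

// Scalar soft-thresholding: dst[i] = sign(src[i]) * max(|src[i]| - lambda, 0).
static void decayL1Naive(float* dst, const float* src, float lambda,
                         size_t len) {
  for (size_t i = 0; i < len; ++i) {
    float v = src[i];
    if (v > lambda) {
      dst[i] = v - lambda;
    } else if (v < -lambda) {
      dst[i] = v + lambda;
    } else {
      dst[i] = 0.0f;
    }
  }
}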
*/ namespace paddle { -GpuSparseMatrix::GpuSparseMatrix(size_t height, size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +GpuSparseMatrix::GpuSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(NULL, height, width, trans, true) { resize(height, width, nnz, valueType, format); } GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle, - hl_sparse_matrix_s_ptr sMatrix, size_t height, - size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, - bool trans, MemoryHandlePtr sMemoryHandle) + hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle) : Matrix(dataHandle, height, width, trans, true) { CHECK(dataHandle && sMatrix) << "Invalid argument pointer"; @@ -67,10 +74,14 @@ GpuSparseMatrix::GpuSparseMatrix(GpuMemHandlePtr dataHandle, sparseResizeCSC(); } -GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, - size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, - bool trans, MemoryHandlePtr sMemoryHandle) +GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle) : Matrix(NULL, height, width, trans, true) { CHECK(sMatrix) << "Invalid argument pointer"; sMatrix_ = sMatrix; @@ -80,9 +91,14 @@ GpuSparseMatrix::GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, valueType_ = valueType; } -GpuSparseMatrix::GpuSparseMatrix(real* value, int* rows, int* cols, - size_t height, size_t width, size_t nnz, - SparseValueType valueType, SparseFormat format, +GpuSparseMatrix::GpuSparseMatrix(real* value, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, bool trans) : Matrix(NULL, height, width, trans, true) { size_t size = 0; @@ -118,9 +134,15 @@ GpuSparseMatrix::GpuSparseMatrix(real* value, int* rows, int* cols, /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, value, rows, cols, HL_SPARSE_CSR, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, - width_, elementCnt_); + &tmp, + value, + rows, + cols, + HL_SPARSE_CSR, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } @@ -143,9 +165,15 @@ GpuSparseMatrix::GpuSparseMatrix(real* value, int* rows, int* cols, /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, value, rows, cols, HL_SPARSE_CSC, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, - width_, elementCnt_); + &tmp, + value, + rows, + cols, + HL_SPARSE_CSC, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, + elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } @@ -171,8 +199,13 @@ void GpuSparseMatrix::sparseResizeCSR() { /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, data_, memoryHandle_->getSize(), HL_SPARSE_CSR, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, width_, + &tmp, + data_, + memoryHandle_->getSize(), + HL_SPARSE_CSR, + valueType_ == NO_VALUE ? 
HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; @@ -197,16 +230,24 @@ void GpuSparseMatrix::sparseResizeCSC() { /* construct hl_sparse_matrix_s */ hl_sparse_matrix_s tmp; hl_construct_sparse_matrix( - &tmp, memoryHandle_->getBuf(), memoryHandle_->getSize(), HL_SPARSE_CSC, - valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, height_, width_, + &tmp, + memoryHandle_->getBuf(), + memoryHandle_->getSize(), + HL_SPARSE_CSC, + valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE, + height_, + width_, elementCnt_); hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix); sMatrix_ = tmp2; } } -void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth, size_t newNnz, - SparseValueType valueType, SparseFormat format) { +void GpuSparseMatrix::resize(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType, + SparseFormat format) { if (format == SPARSE_CSR) { resizeCSR(newHeight, newWidth, newNnz, valueType); } else { @@ -214,8 +255,10 @@ void GpuSparseMatrix::resize(size_t newHeight, size_t newWidth, size_t newNnz, } } -void GpuSparseMatrix::resizeCSR(size_t newHeight, size_t newWidth, - size_t newNnz, SparseValueType valueType) { +void GpuSparseMatrix::resizeCSR(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType) { size_t newSize = (newHeight + 1) * sizeof(int) + newNnz * sizeof(int); if (NO_VALUE != valueType) { newSize += newNnz * sizeof(real); @@ -266,8 +309,10 @@ void GpuSparseMatrix::resizeCSR(size_t newHeight, size_t newWidth, } } -void GpuSparseMatrix::resizeCSC(size_t newHeight, size_t newWidth, - size_t newNnz, SparseValueType valueType) { +void GpuSparseMatrix::resizeCSC(size_t newHeight, + size_t newWidth, + size_t newNnz, + SparseValueType valueType) { size_t newSize = (newWidth + 1) * sizeof(int) + newNnz * sizeof(int); if (NO_VALUE != valueType) { newSize += newNnz * sizeof(real); @@ -327,24 +372,37 @@ MatrixPtr GpuSparseMatrix::getTranspose() { CHECK(memoryHandle_.get() || sMatrix_) << "not supported"; if (memoryHandle_.get()) { MatrixPtr copy_T(new GpuSparseMatrix( - std::dynamic_pointer_cast(memoryHandle_), sMatrix_, - height_, width_, elementCnt_, valueType_, format_, true, + std::dynamic_pointer_cast(memoryHandle_), + sMatrix_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true, sMemoryHandle_)); return copy_T; } else { - MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_, height_, width_, elementCnt_, - valueType_, format_, true, + MatrixPtr copy_T(new GpuSparseMatrix(sMatrix_, + height_, + width_, + elementCnt_, + valueType_, + format_, + true, sMemoryHandle_)); return copy_T; } } -void GpuSparseMatrix::copyRow(int offsets, size_t colNum, +void GpuSparseMatrix::copyRow(int offsets, + size_t colNum, const sparse_non_value_t* row) { memcpy(cols_ + offsets, row, sizeof(int) * colNum); } -void GpuSparseMatrix::copyRow(int offsets, size_t colNum, +void GpuSparseMatrix::copyRow(int offsets, + size_t colNum, const sparse_float_value_t* row) { for (size_t j = 0; j < colNum; j++) { cols_[offsets + j] = row[j].col; @@ -368,7 +426,9 @@ void GpuSparseMatrix::copyFrom(const Matrix& src) { } template -void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data, +void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, + T* data, hl_stream_t stream) { CHECK_EQ(format_, SPARSE_CSR); size_t nnz = 0; @@ -377,7 +437,9 @@ void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data, nnz += 
indices[id + 1] - indices[id]; } - resize(height_, width_, nnz, + resize(height_, + width_, + nnz, sizeof(T) == sizeof(sparse_non_value_t) ? NO_VALUE : FLOAT_VALUE, format_); @@ -399,8 +461,10 @@ void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, T* data, hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, stream); } -void GpuSparseMatrix::setRow(size_t row, size_t colNum, - const unsigned int* cols, const real* values) { +void GpuSparseMatrix::setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { CHECK_EQ(format_, SPARSE_CSR); if (NO_VALUE == valueType_) { CHECK_LT(row, height_); @@ -427,8 +491,8 @@ void GpuSparseMatrix::setRow(size_t row, size_t colNum, sMatrix_->rows = height_; sMatrix_->cols = width_; sMatrix_->nnz = elementCnt_; - hl_memcpy_csr_matrix(sMatrix_.get(), value_, rows_, cols_, - HPPL_STREAM_DEFAULT); + hl_memcpy_csr_matrix( + sMatrix_.get(), value_, rows_, cols_, HPPL_STREAM_DEFAULT); } } @@ -438,8 +502,8 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { CHECK_EQ(format_, SPARSE_CSC); int nnz = sMatrix_->nnz; if (memAlloc) { - matTrans = std::make_shared(width_, height_, nnz, - valueType_, format_, false); + matTrans = std::make_shared( + width_, height_, nnz, valueType_, format_, false); } else { CHECK(matTrans != nullptr); } @@ -449,9 +513,14 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { CpuIVector cols_full(nnz); CpuVector value(nnz); hl_stream_t stream = HPPL_STREAM_1; - hl_memcpy_from_csc_matrix(value.getData(), nnz, rows.getData(), nnz, - cols.getData(), width_ + 1, - sMatrix_.get(), stream); + hl_memcpy_from_csc_matrix(value.getData(), + nnz, + rows.getData(), + nnz, + cols.getData(), + width_ + 1, + sMatrix_.get(), + stream); hl_stream_synchronize(stream); @@ -465,12 +534,14 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { /*sort row index and column index by the ascending order*/ for (int i = 0; i < nnz; i++) { - dataVec.emplace_back(rows.getData()[i], cols_full.getData()[i], - value.getData()[i]); + dataVec.emplace_back( + rows.getData()[i], cols_full.getData()[i], value.getData()[i]); } - std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) { - return a.row < b.row || (a.row == b.row && a.col < b.col); - }); + std::sort(dataVec.begin(), + dataVec.end(), + [](Element a, Element b) { + return a.row < b.row || (a.row == b.row && a.col < b.col); + }); /*get sorted data, row index, and col index, put them in the right place*/ cols.resize(height_ + 1); @@ -494,13 +565,18 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) { /*copy back from cpu*/ GpuSparseMatrixPtr dest = std::dynamic_pointer_cast(matTrans); - hl_memcpy_csc_matrix((dest->sMatrix_).get(), value.getData(), - rows.getData(), cols.getData(), stream); + hl_memcpy_csc_matrix((dest->sMatrix_).get(), + value.getData(), + rows.getData(), + cols.getData(), + stream); hl_stream_synchronize(stream); } -void GpuSparseMatrix::mul(const GpuMatrixPtr a, const GpuMatrixPtr b, - real scaleAB, real scaleT) { +void GpuSparseMatrix::mul(const GpuMatrixPtr a, + const GpuMatrixPtr b, + real scaleAB, + real scaleT) { CHECK(a->useGpu_ && b->useGpu_) << "type not match"; CHECK(!trans_) << "trans not supported"; real* A_d = a->getData(); @@ -527,11 +603,13 @@ void GpuSparseMatrix::mul(const GpuMatrixPtr a, const GpuMatrixPtr b, int dimM = height_; int dimN = width_; int dimK = !b->trans_ ? 
b->getHeight() : b->getWidth(); - hl_sparse_matrix_mul(A_d, a_trans, B_d, b_trans, C_d, dimM, - dimN, dimK, scaleAB, scaleT); + hl_sparse_matrix_mul( + A_d, a_trans, B_d, b_trans, C_d, dimM, dimN, dimK, scaleAB, scaleT); } -void GpuSparseMatrix::mul(const MatrixPtr a, const MatrixPtr b, real scaleAB, +void GpuSparseMatrix::mul(const MatrixPtr a, + const MatrixPtr b, + real scaleAB, real scaleT) { if (std::dynamic_pointer_cast(a) && std::dynamic_pointer_cast(b)) { @@ -559,9 +637,14 @@ void GpuSparseMatrix::print(std::ostream& os) const { IVectorPtr cols = IVector::create(width_ + 1, false); VectorPtr value = Vector::create(nnz, false); hl_stream_t stream = HPPL_STREAM_DEFAULT; - hl_memcpy_from_csc_matrix( - value->getData(), value->getSize(), rows->getData(), rows->getSize(), - cols->getData(), cols->getSize(), sMatrix_.get(), stream); + hl_memcpy_from_csc_matrix(value->getData(), + value->getSize(), + rows->getData(), + rows->getSize(), + cols->getData(), + cols->getSize(), + sMatrix_.get(), + stream); hl_stream_synchronize(stream); printBuf(os, cols->getData(), width_ + 1, "col idx"); @@ -574,11 +657,10 @@ void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) { trans_ = src.trans_; size_t nnz = src.getElementCnt(); - resize(src.getHeight(), src.getWidth(), nnz, valueType_, - src.getFormat()); + resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); // if have different value type, only copy rows and cols SparseValueType vType = - valueType_ != src.getValueType() ? NO_VALUE : valueType_; + valueType_ != src.getValueType() ? NO_VALUE : valueType_; sMatrix_->format = HL_SPARSE_CSR; sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -588,7 +670,9 @@ void GpuSparseMatrix::copyFromCSR(CpuSparseMatrix& src, hl_stream_t stream) { hl_memcpy_csr_matrix(sMatrix_.get(), vType == NO_VALUE ? NULL : src.getValue(), - src.getRows(), src.getCols(), stream); + src.getRows(), + src.getCols(), + stream); // restore type of sMatrix_ sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -598,12 +682,11 @@ void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { trans_ = src.trans_; size_t nnz = src.getElementCnt(); - resize(src.getHeight(), src.getWidth(), nnz, valueType_, - src.getFormat()); + resize(src.getHeight(), src.getWidth(), nnz, valueType_, src.getFormat()); // if have different value type, only copy rows and cols SparseValueType vType = - valueType_ != src.getValueType() ? NO_VALUE : valueType_; + valueType_ != src.getValueType() ? NO_VALUE : valueType_; sMatrix_->format = HL_SPARSE_CSC; sMatrix_->type = vType == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -613,7 +696,9 @@ void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { hl_memcpy_csc_matrix(sMatrix_.get(), vType == NO_VALUE ? NULL : src.getValue(), - src.getRows(), src.getCols(), stream); + src.getRows(), + src.getCols(), + stream); // restore type of sMatrix_ sMatrix_->type = valueType_ == NO_VALUE ? HL_NO_VALUE : HL_FLOAT_VALUE; @@ -622,23 +707,24 @@ void GpuSparseMatrix::copyFromCSC(CpuSparseMatrix& src, hl_stream_t stream) { void GpuSparseMatrix::copyFrom(GpuSparseMatrix& src, hl_stream_t stream) { CHECK(trans_ == src.trans_); CHECK(format_ == src.getFormat()); - resize(src.getHeight(), src.getWidth(), elementCnt_, valueType_, + resize(src.getHeight(), + src.getWidth(), + elementCnt_, + valueType_, src.getFormat()); size_t rowSize = format_ == SPARSE_CSC ? 
elementCnt_ : height_ + 1; size_t colSize = format_ == SPARSE_CSC ? width_ + 1 : elementCnt_; if (valueType_ == FLOAT_VALUE && src.getValueType() == FLOAT_VALUE) { - hl_memcpy_async(getValue(), src.getValue(), - sizeof(real) * elementCnt_, stream); + hl_memcpy_async( + getValue(), src.getValue(), sizeof(real) * elementCnt_, stream); } CHECK(getRows()); CHECK(src.getRows()); - hl_memcpy_async(getRows(), src.getRows(), - sizeof(int) * rowSize, stream); - hl_memcpy_async(getCols(), src.getCols(), - sizeof(int) * colSize, stream); + hl_memcpy_async(getRows(), src.getRows(), sizeof(int) * rowSize, stream); + hl_memcpy_async(getCols(), src.getCols(), sizeof(int) * colSize, stream); } void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) { @@ -652,7 +738,8 @@ void GpuSparseMatrix::copyFrom(CpuSparseMatrix& src, hl_stream_t stream) { void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) { trans_ = src.trans_; int* srcCols = src.getCols(); - size_t nnz = std::count_if(srcCols, srcCols + src.getElementCnt(), + size_t nnz = std::count_if(srcCols, + srcCols + src.getElementCnt(), [this](size_t n) { return n < this->width_; }); resize(height_, width_, nnz, valueType_, format_); @@ -678,9 +765,11 @@ void GpuSparseMatrix::trimFromCSR(const CpuSparseMatrix& src) { sMatrix_->cols = width_; sMatrix_->nnz = nnz; - hl_memcpy_csr_matrix( - sMatrix_.get(), valueType_ == NO_VALUE ? NULL : value_, rows_, cols_, - /*default stream = */ HPPL_STREAM_DEFAULT); + hl_memcpy_csr_matrix(sMatrix_.get(), + valueType_ == NO_VALUE ? NULL : value_, + rows_, + cols_, + /*default stream = */ HPPL_STREAM_DEFAULT); } void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) { @@ -703,9 +792,11 @@ void GpuSparseMatrix::trimFromCSC(const CpuSparseMatrix& src) { sMatrix_->cols = width_; sMatrix_->nnz = nnz; - hl_memcpy_csc_matrix( - sMatrix_.get(), valueType_ == NO_VALUE ? NULL : value_, rows_, cols_, - /*default stream = */ HPPL_STREAM_DEFAULT); + hl_memcpy_csc_matrix(sMatrix_.get(), + valueType_ == NO_VALUE ? NULL : value_, + rows_, + cols_, + /*default stream = */ HPPL_STREAM_DEFAULT); } void GpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) { @@ -766,10 +857,12 @@ void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { #endif } -template void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, +template void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, sparse_non_value_t* data, hl_stream_t stream); -template void GpuSparseMatrix::copyFrom(int64_t* ids, int64_t* indices, +template void GpuSparseMatrix::copyFrom(int64_t* ids, + int64_t* indices, sparse_float_value_t* data, hl_stream_t stream); } // namespace paddle diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h index 4b9a03302bf531a08b889a4b15d36fc8e71458dd..175ef54b858b7f8f31f45796d733af81a9d67066 100644 --- a/paddle/math/SparseMatrix.h +++ b/paddle/math/SparseMatrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include #include "Matrix.h" @@ -35,25 +34,41 @@ public: SparseFormat format_; public: - GpuSparseMatrix(size_t height, size_t width, + GpuSparseMatrix(size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType = FLOAT_VALUE, - SparseFormat format_ = SPARSE_CSR, bool trans = false); + SparseFormat format_ = SPARSE_CSR, + bool trans = false); - GpuSparseMatrix(GpuMemHandlePtr dataHandle, hl_sparse_matrix_s_ptr sMatrix, - size_t height, size_t width, + GpuSparseMatrix(GpuMemHandlePtr dataHandle, + hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, size_t nnz, /* used to allocate space */ SparseValueType valueType = FLOAT_VALUE, - SparseFormat format_ = SPARSE_CSR, bool trans = false, + SparseFormat format_ = SPARSE_CSR, + bool trans = false, MemoryHandlePtr sMemoryHandle = NULL); - GpuSparseMatrix(real* value, int* rows, int* cols, size_t height, - size_t width, size_t nnz, SparseValueType valueType, - SparseFormat format, bool trans); - - GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, size_t height, size_t width, - size_t nnz, SparseValueType valueType, SparseFormat format, - bool trans, MemoryHandlePtr sMemoryHandle); + GpuSparseMatrix(real* value, + int* rows, + int* cols, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans); + + GpuSparseMatrix(hl_sparse_matrix_s_ptr sMatrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType, + SparseFormat format, + bool trans, + MemoryHandlePtr sMemoryHandle); protected: struct Element { @@ -67,9 +82,11 @@ protected: public: ~GpuSparseMatrix() {} - void resize(size_t newHeight, size_t newWidth, + void resize(size_t newHeight, + size_t newWidth, size_t newNnz, /* used to allocate space */ - SparseValueType valueType, SparseFormat format); + SparseValueType valueType, + SparseFormat format); void resize(size_t newHeight, size_t newWidth); @@ -77,13 +94,19 @@ public: void sparseResizeCSC(); - void resizeCSR(size_t newHeight, size_t newWidth, size_t newNnz, + void resizeCSR(size_t newHeight, + size_t newWidth, + size_t newNnz, SparseValueType valueType); - void resizeCSC(size_t newHeight, size_t newWidth, size_t newNnz, + void resizeCSC(size_t newHeight, + size_t newWidth, + size_t newNnz, SparseValueType valueType); - void mul(const GpuMatrixPtr a, const GpuMatrixPtr b, real scaleAB, + void mul(const GpuMatrixPtr a, + const GpuMatrixPtr b, + real scaleAB, real scaleT); /// B = A , B.trans = !A.trans MatrixPtr getTranspose(); @@ -104,7 +127,9 @@ public: template void copyFrom(int64_t* ids, int64_t* indices, T* data, hl_stream_t stream); - void setRow(size_t row, size_t colNum, const unsigned int* cols, + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, const real* values); SparseValueType getValueType() const; SparseFormat getFormat() const { return format_; } @@ -173,7 +198,7 @@ public: * getData is convenient to get value */ real* getData() { return getValue(); } - const real* getData() const { return getValue();} + const real* getData() const { return getValue(); } /** * @brief Get top k value of each row in sparse matrix. 
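SparseMatrix.h above manipulates the raw rows_/cols_/value_ buffers in both CSR and CSC form, so it may help to recall how a CSR triplet is addressed. In CSR the offsets array has height + 1 entries and delimits each row's slice of the column-index and value arrays, and a NO_VALUE matrix stores no values at all, every stored entry being implicitly 1; CSC is the same layout with the roles of rows and columns swapped. A minimal sketch with illustrative names (not Paddle's API):

struct CsrView {
  const int* rowOffsets;  // height + 1 entries; rowOffsets[height] == nnz
  const int* cols;        // nnz column indices
  const float* values;    // nnz values, or nullptr for NO_VALUE matrices
};

// Return element (i, j), or 0 when it is not stored.
static float csrAt(const CsrView& m, int i, int j) {
  for (int k = m.rowOffsets[i]; k < m.rowOffsets[i + 1]; ++k) {
    if (m.cols[k] == j) return m.values ? m.values[k] : 1.0f;
  }
  return 0.0f;
}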
@@ -204,9 +229,7 @@ public: // BaseMatrixT interface public: - bool isSparse() const { - return true; - } + bool isSparse() const { return true; } private: using Matrix::mul; diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp index 6986624d25c7a498da923f4f77b78c25c874b41f..eefaf4b71f4f027d00405bd4b158adc66a902ef7 100644 --- a/paddle/math/SparseRowMatrix.cpp +++ b/paddle/math/SparseRowMatrix.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "SparseRowMatrix.h" #include "CpuSparseMatrix.h" @@ -26,7 +25,8 @@ limitations under the License. */ #include "paddle/utils/Util.h" #include "paddle/utils/Thread.h" -P_DEFINE_bool(allow_inefficient_sparse_update, false, +P_DEFINE_bool(allow_inefficient_sparse_update, + false, "Whether to allow inefficient sparse update"); namespace paddle { @@ -45,7 +45,9 @@ void SparseRowCpuMatrix::init(size_t height, size_t width) { globalIndices_ = indexDictHandle_->globalIndices.data(); } -void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void SparseRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { CpuMatrix::mul(a, b, this, scaleAB, scaleT); } @@ -55,24 +57,25 @@ void SparseRowCpuMatrix::copyFrom(const real* src, size_t size) { } void SparseRowCpuMatrix::zeroMem() { - apply( - [](real* buf, size_t len) { - memset(buf, 0, sizeof(real) * len); - }); + apply([](real* buf, size_t len) { memset(buf, 0, sizeof(real) * len); }); clearRows(); } void SparseRowCpuMatrix::applyL1Decay(real learningRate, real decayRate) { apply([=](real* buf, size_t len) { - CpuVector value(0, nullptr); - value.subVecFrom(buf, 0, len); - value.applyL1(learningRate, decayRate); - }); + CpuVector value(0, nullptr); + value.subVecFrom(buf, 0, len); + value.applyL1(learningRate, decayRate); + }); } -void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, - real learningRate, int currentTime, - real decayRate, bool useL1, bool fini) { +void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, + IVector& t0, + real learningRate, + int currentTime, + real decayRate, + bool useL1, + bool fini) { std::vector& localIndices = indexDictHandle_->localIndices; // t0 and value are vectors @@ -124,7 +127,7 @@ void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, for (size_t j = 0; j < this->width_; ++j) { v[j] -= learningRate * g[j]; } - simd::decayL1(v, v, learningRate*decayRate, this->width_); + simd::decayL1(v, v, learningRate * decayRate, this->width_); // state update to t+1 t[0] = currentTime + 1; @@ -173,8 +176,10 @@ void SparseRowCpuMatrix::sgdUpdate(BaseMatrix& value, IVector& t0, } } -void SparseRowCpuMatrix::addTo(BaseMatrix& dest, std::vector& ids, - size_t tid, size_t numThreads) { +void SparseRowCpuMatrix::addTo(BaseMatrix& dest, + std::vector& ids, + size_t tid, + size_t numThreads) { CHECK(!dest.useGpu_); CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); @@ -182,14 +187,14 @@ void SparseRowCpuMatrix::addTo(BaseMatrix& dest, std::vector& ids, for (size_t i = 0; i < localIndices.size(); ++i) { uint32_t id = localIndices[i]; if (id % numThreads == tid) { - simd::addTo(dest.rowBuf(id), getLocalRow(i), - this->width_); + simd::addTo(dest.rowBuf(id), getLocalRow(i), this->width_); ids.push_back(id); } } } -void SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, size_t tid, +void 
SparseRowCpuMatrix::addTo(SparseRowCpuMatrix& dest, + size_t tid, size_t numThreads) { CHECK(!dest.useGpu_); CHECK_EQ(dest.height_ * dest.width_, this->height_ * this->width_); @@ -214,24 +219,28 @@ void SparseRowCpuMatrix::zeroMemThread(size_t tid, size_t numThreads) { } } -void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, - real scaleAB, real scaleT) { - CpuMatrix::mul(a, b, this, scaleAB, - scaleT); +void SparseAutoGrowRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, + real scaleT) { + CpuMatrix::mul( + a, b, this, scaleAB, scaleT); } -void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, +void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, + CpuMatrix* b, + real scaleAB, real scaleT) { CpuMatrix::mul(a, b, this, scaleAB, scaleT); } void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) { std::vector& localIndices = indexDictHandle_->localIndices; - for (size_t i = 0; i < len; i ++) { + for (size_t i = 0; i < len; i++) { CHECK_LT(*(ids + i), this->getHeight()) - << "id:" << *(ids + i) << "Height:" << this->getHeight() - << "sparse id value exceeds the max input dimension, " - << "it could be caused invalid input data samples"; + << "id:" << *(ids + i) << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused by invalid input data samples"; } localIndices.insert(localIndices.end(), ids, ids + len); } @@ -252,9 +261,9 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) { unsigned int id = (unsigned int)index[i]; CHECK_LT(id, this->getHeight()) - << "id:" << id << "Height:" << this->getHeight() - << "sparse id value exceeds the max input dimension, " - << "it could be caused invalid input data samples"; + << "id:" << id << "Height:" << this->getHeight() + << "sparse id value exceeds the max input dimension, " + << "it could be caused by invalid input data samples"; localIndices.push_back(id); } } diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h index 2dcd81188d6431c317e82ee35e968cddfb334f59..56f113a3614e2e22809abbdaa708557ed3344464 100644 --- a/paddle/math/SparseRowMatrix.h +++ b/paddle/math/SparseRowMatrix.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -41,12 +40,15 @@ public: /// heightStore is max number of rows of the sparse matrix. SparseRowCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, size_t width, - IndexDictPtr indexDictHandle = nullptr, bool trans = false) + size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) : CpuMatrix(nullptr, height, width, trans), storeMat_(dataHandle, dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0, - width, trans), + width, + trans), indexDictHandle_(indexDictHandle) { init(height, width); } @@ -123,8 +125,12 @@ public: * While pass finished, caller should call this func one more time * with (fini=true) to let weight decay catch up current time. 
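The sgdUpdate contract documented here, where rows are only touched when they occur in a batch, t0 remembers the last step at which each row's decay was applied, and a final fini=true call settles the remainder, is easiest to see in a sketch. The following assumes simple multiplicative decay and illustrative names; the real code also supports L1 decay through simd::decayL1, as the .cpp hunk above shows.

#include <cmath>
#include <cstddef>

// Lazy decay catch-up: before using a row at currentTime, apply in one
// step all the decay it missed while it sat untouched.
static void catchUpRow(float* row, size_t width, int* t0, int rowId,
                       int currentTime, float decayRate) {
  int skipped = currentTime - t0[rowId];
  if (skipped <= 0) return;
  // One pow() stands in for `skipped` per-step multiplications.
  float factor = std::pow(1.0f - decayRate, static_cast<float>(skipped));
  for (size_t j = 0; j < width; ++j) row[j] *= factor;
  t0[rowId] = currentTime;  // this row's decay is now up to date
}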
*/ - void sgdUpdate(BaseMatrix& value, IVector& t0, real learningRate, - int currentTime, real decayRate, bool useL1, + void sgdUpdate(BaseMatrix& value, + IVector& t0, + real learningRate, + int currentTime, + real decayRate, + bool useL1, bool fini = false); /** @@ -135,7 +141,9 @@ public: * ids occured in *this* append to *ids* * filtered by (id % numThreads == tid) */ - void addTo(BaseMatrix& dest, std::vector& ids, size_t tid, + void addTo(BaseMatrix& dest, + std::vector& ids, + size_t tid, size_t numThreads); /** @@ -166,7 +174,7 @@ public: } protected: - template + template void apply(Func f) { real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data(); f(data, localIndices_->size() * width_); @@ -211,9 +219,11 @@ class SyncThreadPool; class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix { public: SparsePrefetchRowCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, size_t width, + size_t height, + size_t width, IndexDictPtr indexDictHandle = nullptr, - SyncThreadPool* pool = nullptr, bool trans = false) + SyncThreadPool* pool = nullptr, + bool trans = false) : SparseRowCpuMatrix(dataHandle, height, width, indexDictHandle, trans), pool_(pool) {} @@ -239,7 +249,8 @@ protected: class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix { public: - SparseAutoGrowRowCpuMatrix(size_t height, size_t width, + SparseAutoGrowRowCpuMatrix(size_t height, + size_t width, IndexDictPtr indexDictHandle = nullptr, bool trans = false) : SparseRowCpuMatrix(nullptr, height, width, indexDictHandle, trans) {} @@ -261,8 +272,10 @@ public: class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix { public: - CacheRowCpuMatrix(size_t height, size_t width, - IndexDictPtr indexDictHandle = nullptr, bool trans = false) + CacheRowCpuMatrix(size_t height, + size_t width, + IndexDictPtr indexDictHandle = nullptr, + bool trans = false) : SparseAutoGrowRowCpuMatrix(height, width, indexDictHandle, trans), sourceData_(nullptr) {} @@ -277,8 +290,8 @@ public: id = globalIndices_[row] = localIndices_->size(); localIndices_->push_back(row); checkStoreSize(); - memcpy(getLocalRow(id), sourceData_ + width_ * row, - sizeof(float) * width_); + memcpy( + getLocalRow(id), sourceData_ + width_ * row, sizeof(float) * width_); } return getLocalRow(id); } @@ -300,7 +313,9 @@ public: */ class SparseRowIdsCpuMatrix : public CpuMatrix { public: - SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, size_t height, size_t width, + SparseRowIdsCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, bool trans = false) : CpuMatrix(dataHandle, height, width, trans) {} diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 0403c3521cf54d833b32ff0810ba6d29dfc8f3c6..57ea5c926647d21a82c87fc262e2999e45e7534f 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "Allocator.h" #include "Storage.h" -P_DEFINE_int32(pool_limit_size, 536870912, +P_DEFINE_int32(pool_limit_size, + 536870912, "maximum memory size managed by a memory pool, default is 512M"); namespace paddle { @@ -25,11 +25,10 @@ namespace paddle { // Initialization StorageEngine singleton. // Other modules may rely on storage management, // so StorageEngine need to be initialized before other modules. 
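Because allocators are created on first use, getGpuAllocator (reformatted in the next hunk) double-checks under a read lock before falling back to an exclusive lock. A minimal sketch of that pattern, with std::shared_mutex standing in for Paddle's RWLock/ReadLockGuard and a trivial Pool struct in place of PoolAllocator (both substitutions are assumptions for illustration):

#include <memory>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <vector>

struct Pool { std::string name; };  // stand-in for PoolAllocator

class AllocatorRegistry {
public:
  Pool* get(int deviceId) {
    {
      std::shared_lock<std::shared_mutex> guard(lock_);  // fast path: read lock
      if (deviceId < static_cast<int>(pools_.size()) && pools_[deviceId]) {
        return pools_[deviceId].get();
      }
    }
    std::unique_lock<std::shared_mutex> guard(lock_);  // slow path: write lock
    if (deviceId >= static_cast<int>(pools_.size())) {
      pools_.resize(deviceId + 1);
    }
    if (!pools_[deviceId]) {  // re-check: another thread may have built it
      pools_[deviceId].reset(
          new Pool{"gpu" + std::to_string(deviceId) + "_pool"});
    }
    return pools_[deviceId].get();
  }

private:
  std::shared_mutex lock_;
  std::vector<std::unique_ptr<Pool>> pools_;
};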
-static InitFunction __init_storage_engine([](){StorageEngine::singleton();}, +static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, std::numeric_limits::max()); -StorageEngine::StorageEngine() : cpuAllocator_(nullptr) { -} +StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { if (cpuAllocator_) { @@ -49,8 +48,8 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { { // if gpuAllocator_ has been constructed ReadLockGuard guard(lock_); - if (deviceId < static_cast(gpuAllocator_.size()) - && (gpuAllocator_[deviceId] != nullptr)) { + if (deviceId < static_cast(gpuAllocator_.size()) && + (gpuAllocator_[deviceId] != nullptr)) { return gpuAllocator_[deviceId]; } } @@ -63,9 +62,9 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) { } if (gpuAllocator_[deviceId] == nullptr) { std::string name = - "gpu" + std::to_string(deviceId) + std::string("_pool"); - gpuAllocator_[deviceId] = new PoolAllocator( - new GpuAllocator(), FLAGS_pool_limit_size, name); + "gpu" + std::to_string(deviceId) + std::string("_pool"); + gpuAllocator_[deviceId] = + new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name); } return gpuAllocator_[deviceId]; } @@ -86,10 +85,10 @@ PoolAllocator* StorageEngine::getCpuAllocator() { if (cpuAllocator_ == nullptr) { if (FLAGS_use_gpu) { cpuAllocator_ = new PoolAllocator( - new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool"); + new CudaHostAllocator(), FLAGS_pool_limit_size, "cuda_host_pool"); } else { cpuAllocator_ = new PoolAllocator( - new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool"); + new CpuAllocator(), FLAGS_pool_limit_size, "cpu_pool"); } } return cpuAllocator_; diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index 68a1518d67e6b7f2d59aa5e50ac11ec9af4030d9..b2ade83138428a510e6be1bfa82290008e4167d0 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/Util.h" #include "Vector.h" @@ -49,7 +48,8 @@ std::shared_ptr> VectorT::createParallelVector( } template -std::shared_ptr> VectorT::create(T* data, size_t size, +std::shared_ptr> VectorT::create(T* data, + size_t size, bool useGpu) { if (useGpu) { return std::make_shared>(size, data); @@ -63,10 +63,10 @@ std::shared_ptr> VectorT::create(size_t size, MemoryHandlePtr memoryHandle, size_t offset) { if (auto cpuMemHandle = - std::dynamic_pointer_cast(memoryHandle)) { + std::dynamic_pointer_cast(memoryHandle)) { return std::make_shared>(size, cpuMemHandle, offset); } else if (auto gpuMemHandle = - std::dynamic_pointer_cast(memoryHandle)) { + std::dynamic_pointer_cast(memoryHandle)) { return std::make_shared>(size, gpuMemHandle, offset); } else { LOG(FATAL) << "Wrong"; @@ -76,8 +76,8 @@ std::shared_ptr> VectorT::create(size_t size, template <> MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { - LOG(FATAL) << "Wrong for real vector"; - return nullptr; + LOG(FATAL) << "Wrong for real vector"; + return nullptr; } template <> @@ -89,9 +89,9 @@ MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { CpuIVector cpuIds(height); cpuIds.copyFrom(*this); - int *idData = cpuIds.getData(); + int* idData = cpuIds.getData(); - for (decltype(height) i = 0; i < height; i ++) { + for (decltype(height) i = 0; i < height; i++) { const unsigned int id = idData[i]; CHECK_LT(id, width); mat->setRow(i, 1, &id, nullptr); @@ -101,21 +101,20 @@ MatrixPtr VectorT::toOneHotSparseMatrix(size_t idRange, bool useGpu) { template GpuVectorT::GpuVectorT(size_t size) - : VectorT(size, std::make_shared(sizeof(T) * size), + : VectorT(size, + std::make_shared(sizeof(T) * size), 0, /* offset = 0 */ true /* useGpu = true */) {} template T GpuVectorT::getElement(size_t i) const { T elem = 0; - hl_memcpy_device2host(&elem, const_cast(&this->getData()[i]), - sizeof(T)); + hl_memcpy_device2host(&elem, const_cast(&this->getData()[i]), sizeof(T)); return elem; } template void GpuVectorT::setElement(size_t i, const T& value) { - hl_memcpy_host2device(&this->getData()[i], const_cast(&value), - sizeof(T)); + hl_memcpy_host2device(&this->getData()[i], const_cast(&value), sizeof(T)); } template @@ -219,8 +218,7 @@ real GpuVectorT::getMin() { template T GpuVectorT::get(size_t pos) { T val = (T)0; - hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), - sizeof(T)); + hl_memcpy_device2host((void*)&val, (void*)(this->getData() + pos), sizeof(T)); return val; } @@ -229,7 +227,7 @@ void GpuVectorT::histogram(std::ostream& os, int type) { LOG(FATAL) << "Not implemented"; } -template +template void GpuVectorT::zeroMem() { BaseMatrixT::zero(); } @@ -252,8 +250,10 @@ void GpuVectorT::copyFrom(const VectorT& src) { template void GpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { CHECK_EQ(src.getSize(), this->getSize()); - hl_memcpy_async((void*)this->getData(), (void*)src.getData(), - sizeof(T) * this->getSize(), stream); + hl_memcpy_async((void*)this->getData(), + (void*)src.getData(), + sizeof(T) * this->getSize(), + stream); } template @@ -269,15 +269,16 @@ void GpuVectorT::copyFrom(const T* gpuSrc, size_t size, hl_stream_t stream) { CHECK(gpuSrc != NULL); CHECK_LE(size, this->size_); - hl_memcpy_async((void*)this->getData(), (void*)gpuSrc, - sizeof(T) * size, stream); + hl_memcpy_async( + (void*)this->getData(), (void*)gpuSrc, sizeof(T) * size, stream); } template void GpuVectorT::copyTo(CpuVectorT* dest) const { CHECK_EQ(this->getSize(), dest->getSize()); - 
hl_memcpy_device2host((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), sizeof(T) * this->getSize()); } @@ -285,7 +286,8 @@ template void GpuVectorT::copyTo(GpuVectorT* dest) const { CHECK_EQ(this->getSize(), dest->getSize()); - hl_memcpy_device2device((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2device((void*)dest->getData(), + (void*)this->getData(), sizeof(T) * this->getSize()); } @@ -297,7 +299,8 @@ void GpuVectorT::rand() { template <> void GpuVectorT::print(std::ostream& os, size_t num) const { IVectorPtr dest = IVector::create(this->size_, false); - hl_memcpy_device2host((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), sizeof(int) * this->getSize()); dest->print(os, num); } @@ -305,7 +308,8 @@ void GpuVectorT::print(std::ostream& os, size_t num) const { template <> void GpuVectorT::print(std::ostream& os, size_t num) const { VectorPtr dest = Vector::create(this->size_, false); - hl_memcpy_device2host((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_device2host((void*)dest->getData(), + (void*)this->getData(), sizeof(int) * this->getSize()); dest->print(os, num); } @@ -428,8 +432,8 @@ void GpuVectorT::randnorm(real mean, real std) { CpuVector cpuVec = CpuVector(this->getSize()); cpuVec.randnorm(mean, std); - hl_memcpy_host2device(data_, cpuVec.getData(), - this->getSize() * sizeof(real)); + hl_memcpy_host2device( + data_, cpuVec.getData(), this->getSize() * sizeof(real)); } template <> @@ -437,19 +441,22 @@ void GpuVectorT::uniform(real left, real right) { CpuVector cpuVec = CpuVector(this->getSize()); cpuVec.uniform(left, right); - hl_memcpy_host2device(data_, cpuVec.getData(), - this->getSize() * sizeof(real)); + hl_memcpy_host2device( + data_, cpuVec.getData(), this->getSize() * sizeof(real)); } template CpuVectorT::CpuVectorT(size_t size) - : VectorT(size, std::make_shared(sizeof(T) * size), + : VectorT(size, + std::make_shared(sizeof(T) * size), 0, /* offset = 0 */ false /* useGpu = false */) {} template CpuVectorT::CpuVectorT(const VectorT& src) - : VectorT(src.getSize(), src.getMemoryHandle(), 0, /* offset = 0 */ + : VectorT(src.getSize(), + src.getMemoryHandle(), + 0, /* offset = 0 */ false /* useGpu = false */) { if (typeid(*this->memoryHandle_.get()) != typeid(CpuMemoryHandle)) { this->memoryHandle_ = @@ -646,8 +653,10 @@ void CpuVectorT::copyFrom(const VectorT& src) { template void CpuVectorT::copyFrom(const VectorT& src, hl_stream_t stream) { if (typeid(src) == typeid(GpuVectorT)) { - hl_memcpy_async((void*)this->getData(), (void*)src.getData(), - sizeof(T) * this->getSize(), stream); + hl_memcpy_async((void*)this->getData(), + (void*)src.getData(), + sizeof(T) * this->getSize(), + stream); } else { src.copyTo(this); } @@ -661,7 +670,8 @@ void CpuVectorT::copyFrom(const T* hostSrc, size_t size) { } template -void CpuVectorT::copyFrom(const T* hostSrc, size_t size, +void CpuVectorT::copyFrom(const T* hostSrc, + size_t size, hl_stream_t stream) { (void)stream; @@ -679,7 +689,8 @@ void CpuVectorT::copyTo(CpuVectorT* dest) const { template void CpuVectorT::copyTo(GpuVectorT* dest) const { CHECK_EQ(this->getSize(), dest->getSize()); - hl_memcpy_host2device((void*)dest->getData(), (void*)this->getData(), + hl_memcpy_host2device((void*)dest->getData(), + (void*)this->getData(), sizeof(T) * this->getSize()); } @@ -723,8 +734,8 @@ void ParallelCpuVectorT::parallelExec(ExecFunc func) { template <> void 
ParallelCpuVectorT::parallelExec(ExecFunc func) { pool_->exec([this, func](int tid, size_t numThreads) { - auto interval = calcSplitArrayInterval(this->getSize(), (size_t)tid, - numThreads, 8LU /*for avx*/); + auto interval = calcSplitArrayInterval( + this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); // setup sub bufs CpuVector subVec(0, nullptr); subVec.subVecFrom(*this, interval); @@ -743,7 +754,8 @@ void ParallelCpuVectorT::exec(SyncThreadPool::JobFunc func) { } template -CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) { +CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) + : sync_(nullptr) { if (!useGpu) { cpuVectorT_ = std::make_shared>(size); } else { @@ -754,7 +766,7 @@ CpuGpuVectorT::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) { template CpuGpuVectorT::CpuGpuVectorT(const std::shared_ptr>& src) - : sync_(nullptr) { + : sync_(nullptr) { bool useGpu = src->useGpu(); if (useGpu) { gpuVectorT_ = src; @@ -766,7 +778,7 @@ CpuGpuVectorT::CpuGpuVectorT(const std::shared_ptr>& src) template CpuGpuVectorT::CpuGpuVectorT(size_t size, T* data, bool useGpu) - : sync_(nullptr) { + : sync_(nullptr) { if (!useGpu) { cpuVectorT_ = std::make_shared>(size, data); setSync(DATA_AT_CPU); @@ -777,8 +789,8 @@ CpuGpuVectorT::CpuGpuVectorT(size_t size, T* data, bool useGpu) } template -std::shared_ptr> -CpuGpuVectorT::create(size_t size, bool useGpu) { +std::shared_ptr> CpuGpuVectorT::create(size_t size, + bool useGpu) { return std::make_shared>(size, useGpu); } @@ -809,9 +821,9 @@ void CpuGpuVectorT::resize(size_t size, bool useGpu) { } template -void CpuGpuVectorT::resizeOrCreate( - std::shared_ptr>& vec, - size_t size, bool useGpu) { +void CpuGpuVectorT::resizeOrCreate(std::shared_ptr>& vec, + size_t size, + bool useGpu) { if (vec) { vec->resize(size, useGpu); } else { @@ -833,7 +845,9 @@ void CpuGpuVectorT::resizeOrCreate(size_t size, bool useGpu) { template CpuGpuVectorT::CpuGpuVectorT(CpuGpuVectorT& src, - size_t offset, size_t size) : sync_(nullptr) { + size_t offset, + size_t size) + : sync_(nullptr) { CHECK_LE(offset + size, static_cast(src.getSize())); #ifndef PADDLE_ONLY_CPU SyncedFlag* flag = src.getSync(); @@ -844,21 +858,21 @@ CpuGpuVectorT::CpuGpuVectorT(CpuGpuVectorT& src, } #endif auto cMemHandle = (src.getVector(false))->getMemoryHandle(); - cpuVectorT_ = std::make_shared>(size, - std::dynamic_pointer_cast(cMemHandle), offset); + cpuVectorT_ = std::make_shared>( + size, std::dynamic_pointer_cast(cMemHandle), offset); #ifndef PADDLE_ONLY_CPU auto gMemHandle = (src.getVector(true))->getMemoryHandle(); - gpuVectorT_ = std::make_shared>(size, - std::dynamic_pointer_cast(gMemHandle), offset); + gpuVectorT_ = std::make_shared>( + size, std::dynamic_pointer_cast(gMemHandle), offset); src.setSync(SYNCED); #endif setSync(src.getSync()); } template -std::shared_ptr> -CpuGpuVectorT::getVector(bool useGpu) const { - auto * self = const_cast*>(this); +std::shared_ptr> CpuGpuVectorT::getVector( + bool useGpu) const { + auto* self = const_cast*>(this); if (useGpu) { self->copyToGpu(); return std::const_pointer_cast>(gpuVectorT_); @@ -964,8 +978,10 @@ void CpuGpuVectorT::copyFrom(const T* data, size_t size, bool useGpu) { } template -void CpuGpuVectorT::copyFrom(const T* data, size_t size, - hl_stream_t stream, bool useGpu) { +void CpuGpuVectorT::copyFrom(const T* data, + size_t size, + hl_stream_t stream, + bool useGpu) { if (useGpu) { copyToGpu(data, size, stream); } else { @@ -975,7 +991,10 @@ void CpuGpuVectorT::copyFrom(const T* data, size_t size, 
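CpuGpuVectorT, whose constructors are reformatted above, keeps twin CPU and GPU buffers plus a SyncedFlag that records which side currently holds valid data, so transfers happen lazily and only when the requested side is stale. A minimal sketch of that protocol, with std::vector copies standing in for the hl_memcpy_* calls (the names and the float element type are illustrative):

#include <vector>

enum SyncedFlag { DATA_AT_CPU, DATA_AT_GPU, SYNCED };

struct TwinBuffer {
  std::vector<float> cpu, gpu;  // stand-ins for real host/device memory
  SyncedFlag flag = DATA_AT_CPU;

  const float* getData(bool useGpu) {
    if (useGpu && flag == DATA_AT_CPU) {
      gpu = cpu;  // host-to-device copy in the real code
      flag = SYNCED;
    } else if (!useGpu && flag == DATA_AT_GPU) {
      cpu = gpu;  // device-to-host copy in the real code
      flag = SYNCED;
    }
    return useGpu ? gpu.data() : cpu.data();
  }

  float* getMutableData(bool useGpu) {
    getData(useGpu);  // make the requested side current first
    flag = useGpu ? DATA_AT_GPU : DATA_AT_CPU;  // writer invalidates the other
    return useGpu ? gpu.data() : cpu.data();
  }
};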
template void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, - size_t offset, size_t size, bool useGpu, hl_stream_t stream) { + size_t offset, + size_t size, + bool useGpu, + hl_stream_t stream) { if (useGpu) { VectorT::resizeOrCreate(gpuVectorT_, size, true); gpuVectorT_->copyFrom(src.getData(true) + offset, size, stream); @@ -987,8 +1006,7 @@ void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, } template -void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, - hl_stream_t stream) { +void CpuGpuVectorT::copyFrom(CpuGpuVectorT& src, hl_stream_t stream) { switch (*src.getSync()) { case DATA_AT_CPU: copyFrom(*(src.getVector(false)), stream); diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h index faf8186b6d10d7cbc14376ff3b6543d1303b2ab1..46a25c04dff6041222b8c97b8904322546f2bbe3 100644 --- a/paddle/math/Vector.h +++ b/paddle/math/Vector.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -39,12 +38,11 @@ class SyncThreadPool; class Matrix; -template +template class BaseVector : public BaseMatrixT { public: BaseVector(size_t size, T* data, bool useGpu) - : BaseMatrixT(1, size, data, false, useGpu), - size_(this->width_) {} + : BaseMatrixT(1, size, data, false, useGpu), size_(this->width_) {} ~BaseVector() {} @@ -113,7 +111,8 @@ public: this->size_ = newSize; } - static void resizeOrCreate(std::shared_ptr>& vec, size_t size, + static void resizeOrCreate(std::shared_ptr>& vec, + size_t size, bool useGpu) { if (vec) { vec->resize(size); @@ -431,11 +430,7 @@ public: * * SYNCED: data is located in CPU and GPU simultaneously. */ - enum SyncedFlag { - DATA_AT_CPU = 0, - DATA_AT_GPU = 1, - SYNCED = 2 - }; + enum SyncedFlag { DATA_AT_CPU = 0, DATA_AT_GPU = 1, SYNCED = 2 }; /** * @brief A constructor, create cpuVectorT_ or gpuVectorT_. @@ -469,8 +464,7 @@ public: */ CpuGpuVectorT(size_t size, T* data, bool useGpu); - CpuGpuVectorT(CpuGpuVectorT& src, - size_t offset, size_t size); + CpuGpuVectorT(CpuGpuVectorT& src, size_t offset, size_t size); virtual ~CpuGpuVectorT() {} @@ -489,8 +483,8 @@ public: * @brief resize or create CpuGpuVectorT. */ static void resizeOrCreate(std::shared_ptr>& vec, - size_t size, bool useGpu); - + size_t size, + bool useGpu); /** * @brief return a const cpuVectorT_ or gpuVectorT_. @@ -522,10 +516,10 @@ public: */ const T* getData(bool useGpu) const; -// TODO(yuyang18): Make getData more c++ style. -// inline T* getData(bool useGpu) { -// return getMutableData(useGpu); -// } + // TODO(yuyang18): Make getData more c++ style. + // inline T* getData(bool useGpu) { + // return getMutableData(useGpu); + // } T* getMutableData(bool useGpu); @@ -615,8 +609,11 @@ public: /** * @brief copy from (src + offset) using specifed-stream. */ - void copyFrom(CpuGpuVectorT& src, size_t offset, size_t size, - bool useGpu, hl_stream_t stream); + void copyFrom(CpuGpuVectorT& src, + size_t offset, + size_t size, + bool useGpu, + hl_stream_t stream); /** * @brief copy from src using specifed-stream. @@ -626,16 +623,12 @@ public: /** * @brief return sync_. */ - inline SyncedFlag* getSync() const { - return sync_; - } + inline SyncedFlag* getSync() const { return sync_; } /** * @brief set sync_. 
*/ - inline void setSync(SyncedFlag* sync) { - sync_ = sync; - } + inline void setSync(SyncedFlag* sync) { sync_ = sync; } inline void setSync(SyncedFlag syncFlag) { if (sync_) { diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp index c94e7f043c04a4551e0be76c6761a1078fadcd36..084322a1caf579cf6237b41c51efa220c6f2d5a2 100644 --- a/paddle/math/tests/test_Allocator.cpp +++ b/paddle/math/tests/test_Allocator.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/Util.h" #include "paddle/utils/Logging.h" @@ -21,11 +20,12 @@ limitations under the License. */ #include "paddle/math/Allocator.h" #include "paddle/math/PoolAllocator.h" -using namespace paddle; // NOLINT +using namespace paddle; // NOLINT -template +template void testPoolAllocator() { - PoolAllocator* pool = new PoolAllocator(new Allocator(), /* sizeLimit */1024); + PoolAllocator* pool = + new PoolAllocator(new Allocator(), /* sizeLimit */ 1024); /* alloc from system memory */ void* ptr1 = pool->alloc(10); diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp index ae201f172373caa45186cdc378cf9dd06a136181..b3eca19a7291d2b71b801793f824c1087a3ded27 100644 --- a/paddle/math/tests/test_ExecViaCpu.cpp +++ b/paddle/math/tests/test_ExecViaCpu.cpp @@ -23,7 +23,10 @@ using namespace paddle; // NOLINT const int height = 10; const int width = 16; -real f(Matrix& mat1, const Matrix& mat2, IVector& vec1, const IVector& vec2, +real f(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, real scalar) { CHECK(!mat1.useGpu()); CHECK(!mat2.useGpu()); @@ -37,8 +40,11 @@ real f(Matrix& mat1, const Matrix& mat2, IVector& vec1, const IVector& vec2, class Functor { public: - real operator()(Matrix& mat1, const Matrix& mat2, IVector& vec1, - const IVector& vec2, real scalar) { + real operator()(Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) { a_ = f(mat1, mat2, vec1, vec2, scalar); return a_; } @@ -93,9 +99,13 @@ TEST(ExecViaCpu, test1) { testWrapper(f); testWrapper(&f); - auto lambda = - [](Matrix& mat1, const Matrix& mat2, IVector& vec1, const IVector& vec2, - real scalar) -> real { return f(mat1, mat2, vec1, vec2, scalar); }; + auto lambda = [](Matrix& mat1, + const Matrix& mat2, + IVector& vec1, + const IVector& vec2, + real scalar) -> real { + return f(mat1, mat2, vec1, vec2, scalar); + }; LOG(INFO) << "lambda is_class=" << std::is_class::value << " is_function=" << std::is_function::value; testWrapper(lambda); diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp index 174278c2aaac4575a6ea0b219bf7a389db712703..f996e0daddd3ef41e195de48640631a979a87192 100644 --- a/paddle/math/tests/test_FPException.cpp +++ b/paddle/math/tests/test_FPException.cpp @@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - /** * This test is about floating point calculation exception. * Paddle catches FE_INVALID, FE DIVBYZERO and FE_OVERFLOW exceptions. * - * Some exceptions occur in the middle of a set of formulas, + * Some exceptions occur in the middle of a set of formulas, * that can be circumvented by some tricks. 
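The comment block above is worth a concrete illustration: with FP exceptions enabled, the textbook tanh formula traps for large negative inputs because exp(-2 * a) overflows once -2 * a exceeds roughly 709 in double precision, and the trick is to keep exp's argument non-positive and restore the sign by oddness. A sketch under those assumptions (the test itself exercises Paddle's Matrix::tanh, not these helpers):

#include <cmath>

// Naive form: for a = -710, exp(-2 * a) = exp(1420) overflows double,
// raising FE_OVERFLOW once floating-point exceptions are enabled.
double naiveTanh(double a) {
  return 2.0 / (1.0 + std::exp(-2.0 * a)) - 1.0;
}

// Safe form: exp's argument is always <= 0, so exp stays in (0, 1];
// the sign is restored via tanh(-a) = -tanh(a).
double safeTanh(double a) {
  double t = std::exp(-2.0 * std::fabs(a));
  double m = (1.0 - t) / (1.0 + t);  // tanh(|a|)
  return a < 0 ? -m : m;
}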
- * For example, + * For example, * calculate tanh * b = 2.0 / (1.0 + exp(-2 * a)) - 1.0 * @@ -34,7 +33,7 @@ limitations under the License. */ #include "paddle/math/Matrix.h" #include "paddle/utils/Excepts.h" -using namespace paddle; // NOLINT +using namespace paddle; // NOLINT void SetTensorValue(Matrix& matrix, real value) { int height = matrix.getHeight(); @@ -53,7 +52,7 @@ void SetTensorValue(Matrix& matrix, real value) { } } -template +template void testTanh(real illegal) { MatrixPtr A = std::make_shared(10, 10); MatrixPtr B = std::make_shared(10, 10); @@ -65,7 +64,7 @@ void testTanh(real illegal) { A->tanh(*B); } -template +template void testSigmoid(real illegal) { MatrixPtr A = std::make_shared(10, 10); MatrixPtr B = std::make_shared(10, 10); diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp index 491b0cda7b9e1a13882aee6621e0de984709ae80..8405b96fc2b915e2e1a5676ab5e3f25b4acde75a 100644 --- a/paddle/math/tests/test_SIMDFunctions.cpp +++ b/paddle/math/tests/test_SIMDFunctions.cpp @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - - #include "paddle/math/SIMDFunctions.h" #include "paddle/utils/Util.h" @@ -128,13 +126,13 @@ TEST(SIMDFunction, decayL1_WithLR) { typedef std::function DecayL1MethodType; - DecayL1MethodType naive = [](float* d, float* s, float* lr, float l, - size_t len) { + DecayL1MethodType naive = []( + float* d, float* s, float* lr, float l, size_t len) { paddle::simd::naive::decayL1(d, s, lr, l, len); }; - DecayL1MethodType simd = [](float* d, float* s, float* lr, float l, - size_t len) { + DecayL1MethodType simd = []( + float* d, float* s, float* lr, float l, size_t len) { paddle::simd::decayL1(d, s, lr, l, len); }; diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp index 737504da388be72de70d37d87dc866b8448f6cd2..a9596992b2b1fced417c048600b05b39882b2bf2 100644 --- a/paddle/math/tests/test_batchTranspose.cpp +++ b/paddle/math/tests/test_batchTranspose.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "test_matrixUtil.h" #include "hl_batch_transpose.h" @@ -48,8 +47,8 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) { cData[sample_id * nx * ny + j * nx + i]; // device gMat->copyFrom(*cMat, HPPL_STREAM_DEFAULT); - batchTranspose(gMat->getData(), gBatchTransMat->getData(), nx, ny, - numSamples); + batchTranspose( + gMat->getData(), gBatchTransMat->getData(), nx, ny, numSamples); cMat_d2h->copyFrom(*gBatchTransMat, HPPL_STREAM_DEFAULT); checkMatrixEqual(cBatchTransMat, cMat_d2h); } diff --git a/paddle/math/tests/test_matrix.cpp b/paddle/math/tests/test_matrix.cpp index 71c9622420aef73848ee7e85c505a6d40f64f3c1..3788218aab100d4ad683e85149a9513e54ca2480 100644 --- a/paddle/math/tests/test_matrix.cpp +++ b/paddle/math/tests/test_matrix.cpp @@ -48,7 +48,8 @@ struct MatrixPara { }; #ifndef PADDLE_ONLY_CPU -void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, +void test_sparse_matrix_mul(MatrixPara paraA, + MatrixPara paraB, MatrixPara paraC) { // for cpu sparse matrix mul MatrixPtr cpuMatrixA, cpuMatrixB, cpuMatrixC, gpuMatrixC_d2h; @@ -58,12 +59,20 @@ void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, MatrixPtr cpuDenseA, cpuDenseB, cpuDenseC; if (paraA.sparse) { - cpuMatrixA = Matrix::createSparseMatrix(paraA.height, paraA.width, - paraA.nnz, FLOAT_VALUE, - paraA.format, paraA.trans, false); - gpuMatrixA = Matrix::createSparseMatrix(paraA.height, paraA.width, - paraA.nnz, FLOAT_VALUE, - paraA.format, paraA.trans, true); + cpuMatrixA = Matrix::createSparseMatrix(paraA.height, + paraA.width, + paraA.nnz, + FLOAT_VALUE, + paraA.format, + paraA.trans, + false); + gpuMatrixA = Matrix::createSparseMatrix(paraA.height, + paraA.width, + paraA.nnz, + FLOAT_VALUE, + paraA.format, + paraA.trans, + true); } else { cpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); gpuMatrixA = Matrix::create(paraA.height, paraA.width, paraA.trans, true); @@ -71,12 +80,20 @@ void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, cpuDenseA = Matrix::create(paraA.height, paraA.width, paraA.trans, false); if (paraB.sparse) { - cpuMatrixB = Matrix::createSparseMatrix(paraB.height, paraB.width, - paraB.nnz, FLOAT_VALUE, - paraB.format, paraB.trans, false); - gpuMatrixB = Matrix::createSparseMatrix(paraB.height, paraB.width, - paraB.nnz, FLOAT_VALUE, - paraB.format, paraB.trans, true); + cpuMatrixB = Matrix::createSparseMatrix(paraB.height, + paraB.width, + paraB.nnz, + FLOAT_VALUE, + paraB.format, + paraB.trans, + false); + gpuMatrixB = Matrix::createSparseMatrix(paraB.height, + paraB.width, + paraB.nnz, + FLOAT_VALUE, + paraB.format, + paraB.trans, + true); } else { cpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); gpuMatrixB = Matrix::create(paraB.height, paraB.width, paraB.trans, true); @@ -84,15 +101,27 @@ void test_sparse_matrix_mul(MatrixPara paraA, MatrixPara paraB, cpuDenseB = Matrix::create(paraB.height, paraB.width, paraB.trans, false); if (paraC.sparse) { - cpuMatrixC = Matrix::createSparseMatrix(paraC.height, paraC.width, - paraC.nnz, FLOAT_VALUE, - paraC.format, paraC.trans, false); - gpuMatrixC = Matrix::createSparseMatrix(paraC.height, paraC.width, - paraC.nnz, FLOAT_VALUE, - paraC.format, paraC.trans, true); - gpuMatrixC_d2h = Matrix::createSparseMatrix( - paraC.height, paraC.width, paraC.nnz, FLOAT_VALUE, paraC.format, - paraC.trans, false); + cpuMatrixC = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + false); + gpuMatrixC = 
Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + true); + gpuMatrixC_d2h = Matrix::createSparseMatrix(paraC.height, + paraC.width, + paraC.nnz, + FLOAT_VALUE, + paraC.format, + paraC.trans, + false); } else { cpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, false); gpuMatrixC = Matrix::create(paraC.height, paraC.width, paraC.trans, true); @@ -267,8 +296,8 @@ TEST(Matrix, CpuSparseMatrixSubMatrix) { } } -void sparseValid(int* major, int* minor, size_t nnz, size_t majorLen, - size_t minorLen) { +void sparseValid( + int* major, int* minor, size_t nnz, size_t majorLen, size_t minorLen) { CHECK_EQ(nnz, size_t(major[majorLen - 1])); CHECK_EQ(nnz, minorLen); for (size_t i = 0; i < majorLen - 1; i++) { @@ -375,14 +404,25 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { int64_t trimedIndices[11] = {0, 1, 3, 3, 7, 7, 9, 10, 12, 16, 19}; sparse_float_value_t trimedData[19]; int trimedValue[19] = { - 1, // row_0 : 1 - 3, 1, // row_1 : 2 - 0, 1, 2, 3, // row_3 : 4 - 2, 3, // row_5 : 2 - 3, // row_6 : 1 - 0, 1, // row_7 : 2 - 0, 1, 2, 3, // row_8 : 4 - 2, 3, 1 // row_9 : 3 + 1, // row_0 : 1 + 3, + 1, // row_1 : 2 + 0, + 1, + 2, + 3, // row_3 : 4 + 2, + 3, // row_5 : 2 + 3, // row_6 : 1 + 0, + 1, // row_7 : 2 + 0, + 1, + 2, + 3, // row_8 : 4 + 2, + 3, + 1 // row_9 : 3 }; for (size_t i = 0; i < 19; i++) { trimedData[i].col = trimedValue[i]; @@ -415,9 +455,13 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) { height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true); matC->trimFrom(*mat); - CpuSparseMatrixPtr matD = std::make_shared( - height, trimedWidth, matC->getElementCnt(), FLOAT_VALUE, SPARSE_CSR, - false); + CpuSparseMatrixPtr matD = + std::make_shared(height, + trimedWidth, + matC->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSR, + false); matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); checkSMatrixEqual2(matA, matD); @@ -462,11 +506,17 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { int trimedIndices[6] = {0, 1, 5, 5, 9, 13}; int trimedValue[13] = { 1, // col_0 : 1 - 5, 3, 1, + 5, + 3, + 1, 6, // col_1 : 4 - 0, 1, 2, + 0, + 1, + 2, 3, // col_3 : 4 - 4, 5, 6, + 4, + 5, + 6, 7 // col_4 : 4 }; std::vector rowsA(trimedValue, trimedValue + 13); @@ -499,9 +549,13 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) { height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true); matC->trimFrom(*mat); - CpuSparseMatrixPtr matD = std::make_shared( - height, trimedWidth, matC->getElementCnt(), FLOAT_VALUE, SPARSE_CSC, - false); + CpuSparseMatrixPtr matD = + std::make_shared(height, + trimedWidth, + matC->getElementCnt(), + FLOAT_VALUE, + SPARSE_CSC, + false); matD->copyFrom(*matC, HPPL_STREAM_DEFAULT); hl_stream_synchronize(HPPL_STREAM_DEFAULT); checkSMatrixEqual2(matA, matD); diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 9c03695ba5055c4bdb3e7c578d3e352fbd6fae6f..ae5bc5a86a1790ce30a8d7f83c9564f52d7cf7ea 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -23,11 +23,10 @@ limitations under the License. 
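The trimFrom fixtures above use CSR layout: the index array is a prefix sum, so row r owns the slice [indices[r], indices[r+1]) of the column array. A small sketch decoding the first rows of the trimmed fixture (the numbers mirror the start of trimedIndices/trimedValue above; row 2 is empty):

```cpp
#include <cstdio>

int main() {
  const int indices[] = {0, 1, 3, 3, 7};        // prefix sums for rows 0..3
  const int cols[]    = {1, 3, 1, 0, 1, 2, 3};  // column ids, row by row
  for (int r = 0; r < 4; ++r) {
    std::printf("row_%d (%d nnz):", r, indices[r + 1] - indices[r]);
    for (int k = indices[r]; k < indices[r + 1]; ++k) std::printf(" %d", cols[k]);
    std::printf("\n");
  }
  return 0;  // prints row_0: 1 | row_1: 3 1 | row_2: (empty) | row_3: 0 1 2 3
}
```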
*/ #include "paddle/gserver/tests/TestUtil.h" #include "paddle/utils/Stat.h" - using namespace paddle; // NOLINT using namespace std; // NOLINT -template +template void VectorCheckEqual(const VectorT& vector1, const VectorT& vector2) { CHECK(vector1.getSize() == vector2.getSize()); @@ -90,7 +89,9 @@ void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) { EXPECT_EQ(count, 0) << "There are " << count << " different element."; } -void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, +void testBilinearFwdBwd(int numSamples, + int imgSizeH, + int imgSizeW, int channels) { int inWidth = imgSizeH * imgSizeW * channels; int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels; @@ -107,10 +108,22 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, input->randomizeUniform(); inputGpu->copyFrom(*input); - target->bilinearForward(*input, imgSizeH, imgSizeW, - 2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW); - targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW, - 2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW); + target->bilinearForward(*input, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); + targetGpu->bilinearForward(*inputGpu, + imgSizeH, + imgSizeW, + 2 * imgSizeH, + 2 * imgSizeW, + channels, + ratioH, + ratioW); // check targetCheck->copyFrom(*targetGpu); @@ -121,8 +134,8 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false, - true); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false, false); @@ -131,10 +144,22 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW, inputGpuGrad->copyFrom(*inputGrad); targetGpuGrad->copyFrom(*targetGrad); - inputGrad->bilinearBackward(*targetGrad, 2 * imgSizeH, 2 * imgSizeW, - imgSizeH, imgSizeW, channels, ratioH, ratioW); - inputGpuGrad->bilinearBackward(*targetGpuGrad, 2 * imgSizeH, 2 * imgSizeW, - imgSizeH, imgSizeW, channels, ratioH, ratioW); + inputGrad->bilinearBackward(*targetGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); + inputGpuGrad->bilinearBackward(*targetGpuGrad, + 2 * imgSizeH, + 2 * imgSizeW, + imgSizeH, + imgSizeW, + channels, + ratioH, + ratioW); // check targetCheckGrad->copyFrom(*inputGpuGrad); @@ -146,10 +171,8 @@ TEST(Matrix, BilinearFwdBwd) { for (auto channels : {8, 16}) { for (auto imgSizeH : {14, 28}) { for (auto imgSizeW : {16, 30}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW; + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; testBilinearFwdBwd(numSamples, imgSizeH, imgSizeW, channels); } } @@ -157,8 +180,11 @@ TEST(Matrix, BilinearFwdBwd) { } } -void testMatrixProjectionForward(int contextStart, int contextLength, - bool padding, int batchSize, int inputDim) { +void testMatrixProjectionForward(int contextStart, + int contextLength, + bool padding, + int batchSize, + int inputDim) { MatrixPtr cpuInput = std::make_shared(batchSize, inputDim); MatrixPtr gpuInput = std::make_shared(batchSize, inputDim); cpuInput->randomizeUniform(); @@ -190,12 +216,20 @@ void 
testMatrixProjectionForward(int contextStart, int contextLength, // calculate int beginPad = std::max(0, -contextStart); - cpuOutput->contextProjectionForward(cpuInput, cpuWeight, *cpuSequence, - contextLength, contextStart, beginPad, + cpuOutput->contextProjectionForward(cpuInput, + cpuWeight, + *cpuSequence, + contextLength, + contextStart, + beginPad, padding); - gpuOutput->contextProjectionForward(gpuInput, gpuWeight, *gpuSequence, - contextLength, contextStart, beginPad, + gpuOutput->contextProjectionForward(gpuInput, + gpuWeight, + *gpuSequence, + contextLength, + contextStart, + beginPad, padding); // check @@ -206,8 +240,11 @@ void testMatrixProjectionForward(int contextStart, int contextLength, MatrixCheckEqual(*cpuOutput, *outputCheck); } -void testMatrixProjectionBackward(int contextStart, int contextLength, - bool padding, int batchSize, int inputDim) { +void testMatrixProjectionBackward(int contextStart, + int contextLength, + bool padding, + int batchSize, + int inputDim) { MatrixPtr cpuOutputGrad = std::make_shared(batchSize, inputDim * contextLength); MatrixPtr gpuOutputGrad = @@ -239,15 +276,22 @@ void testMatrixProjectionBackward(int contextStart, int contextLength, // calculate int beginPad = std::max(0, -contextStart); - cpuOutputGrad->contextProjectionBackward(cpuInputGrad, cpuWeightGrad, - *cpuSequence, contextLength, - contextStart, beginPad, padding); - gpuOutputGrad->contextProjectionBackwardData(gpuInputGrad, *gpuSequence, - contextLength, contextStart); + cpuOutputGrad->contextProjectionBackward(cpuInputGrad, + cpuWeightGrad, + *cpuSequence, + contextLength, + contextStart, + beginPad, + padding); + gpuOutputGrad->contextProjectionBackwardData( + gpuInputGrad, *gpuSequence, contextLength, contextStart); if (padding) { - gpuOutputGrad->contextProjectionBackwardWeight( - gpuWeightGrad, *gpuSequence, contextLength, - contextStart, pad, beginPad); + gpuOutputGrad->contextProjectionBackwardWeight(gpuWeightGrad, + *gpuSequence, + contextLength, + contextStart, + pad, + beginPad); } // check @@ -269,13 +313,19 @@ TEST(Matrix, projection) { for (auto batchSize : {1, 2, 5, 20, 100}) { for (auto inputDim : {15, 32, 63, 128, 200}) { VLOG(3) << " contextStart=" << contextStart - << " contextLength=" << contextLength - << " trainablePadding=" << trainablePadding - << " batchSize=" << batchSize << " inputDim=" << inputDim; - testMatrixProjectionForward(contextStart, contextLength, - trainablePadding, batchSize, inputDim); - testMatrixProjectionBackward(contextStart, contextLength, - trainablePadding, batchSize, inputDim); + << " contextLength=" << contextLength + << " trainablePadding=" << trainablePadding + << " batchSize=" << batchSize << " inputDim=" << inputDim; + testMatrixProjectionForward(contextStart, + contextLength, + trainablePadding, + batchSize, + inputDim); + testMatrixProjectionBackward(contextStart, + contextLength, + trainablePadding, + batchSize, + inputDim); } } } @@ -813,7 +863,6 @@ void testSequenceSoftmax(int batchSize) { MatrixCheckErr(*cpuInput, *outputCheck); } - void testMatrixSoftmaxThreshold(int height, int width) { MatrixPtr cpuInput = std::make_shared(height, width); MatrixPtr cpuOutput = std::make_shared(height, width); @@ -1216,7 +1265,7 @@ TEST(Matrix, AtOffset) { for (auto width1 : {1, 32, 100, 512, 1000}) { for (auto width2 : {1, 32, 100, 512, 1000}) { VLOG(3) << " height=" << height << " width1=" << width1 - << " width2=" << width2; + << " width2=" << width2; testMatrixAddAtOffset(height, width1, width2); testMatrixAssignAtOffset(height, 
width1, width2); @@ -1284,7 +1333,7 @@ TEST(Matrix, tableProjection) { for (auto tableSize : {10, 100}) { for (auto inputDim : {20, 50}) { VLOG(3) << " numSamples=" << numSamples << " tableSize=" << tableSize - << " inputDim=" << inputDim; + << " inputDim=" << inputDim; testMatrixSelectRows(numSamples, tableSize, inputDim); testMatrixAddToRows(numSamples, tableSize, inputDim); } @@ -1359,8 +1408,12 @@ void testSubMatrixMul(bool transa, bool transb, int dimM, int dimN, int dimK) { } }; - auto subMatrix = [](MatrixPtr& sub, MatrixPtr matrix, size_t startRow, - size_t endRow, size_t startCol, size_t endCol) { + auto subMatrix = [](MatrixPtr& sub, + MatrixPtr matrix, + size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol) { if (!matrix->isTransposed()) { sub = matrix->subMatrix(startRow, endRow, startCol, endCol); } else { @@ -1404,9 +1457,9 @@ TEST(Matrix, mul) { continue; } VLOG(3) << setiosflags(ios::left) << setfill(' ') - << " transa=" << transa << " transb=" << transb - << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) - << dimN << " dimK=" << setw(5) << dimK; + << " transa=" << transa << " transb=" << transb + << " dimM=" << setw(5) << dimM << " dimN=" << setw(5) + << dimN << " dimK=" << setw(5) << dimK; testMatrixMul(transa, transb, dimM, dimN, dimK); testSubMatrixMul(transa, transb, dimM, dimN, dimK); @@ -1436,7 +1489,7 @@ TEST(Vector, rowFunc) { } } -template +template void testVectorReset(int size) { std::shared_ptr> cpu = std::make_shared>(size); std::shared_ptr> gpu = std::make_shared>(size); @@ -1450,14 +1503,14 @@ void testVectorReset(int size) { VectorCheckEqual(*cpu, *out); } -template +template void testVecortSelectFrom(int size) { std::shared_ptr> cpuDst = std::make_shared>(size); std::shared_ptr> gpuDst = std::make_shared>(size); - std::shared_ptr> - cpuSrc = std::make_shared>(size*2); - std::shared_ptr> - gpuSrc = std::make_shared>(size*2); + std::shared_ptr> cpuSrc = + std::make_shared>(size * 2); + std::shared_ptr> gpuSrc = + std::make_shared>(size * 2); CpuIVectorPtr cpuIds = std::make_shared>(size); GpuIVectorPtr gpuIds = std::make_shared>(size); @@ -1478,7 +1531,7 @@ void testVecortSelectFrom(int size) { VectorCheckEqual(*cpuDst, *out); } -template +template void testVecotrZeroMem(int size) { std::shared_ptr> cpu = std::make_shared>(size); std::shared_ptr> gpu = std::make_shared>(size); @@ -1491,7 +1544,7 @@ void testVecotrZeroMem(int size) { VectorCheckEqual(*cpu, *out); } -template +template void testVectorIsEqual(int size) { std::shared_ptr> cpuA = std::make_shared>(size); std::shared_ptr> cpuB = std::make_shared>(size); @@ -1549,12 +1602,11 @@ void testMatrixTopK(int samples, int dim, int beamSize) { TEST(Matrix, topK) { for (auto samples : {1, 5, 31, 90, 150, 500}) { - for (auto dim : {1, 5 , 8, 10, 15, 64, 80, 120, 256, 300, - 1280, 5120, 50000}) { + for (auto dim : + {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples - << " beamSize=" << beamSize + VLOG(3) << " samples=" << samples << " beamSize=" << beamSize << " dim=" << dim; testMatrixTopK(samples, dim, beamSize); } @@ -1604,10 +1656,8 @@ TEST(SMatrix, topK) { for (auto beamSize : {1, 5, 40, 100, 500}) { for (auto ratio : {0.01, 0.001}) { if (beamSize > dim) continue; - VLOG(3) << " samples=" << samples - << " beamSize=" << beamSize - << " dim=" << dim - << " ratio=" << ratio; + VLOG(3) << " samples=" << samples << " beamSize=" << beamSize + << " 
dim=" << dim << " ratio=" << ratio; testSMatrixTopK(samples, dim, beamSize, ratio); } } @@ -1728,8 +1778,7 @@ TEST(Matrix, cosSim) { } } -void testCosSimDerivate(int heightX, int heightY, int width, - real scale) { +void testCosSimDerivate(int heightX, int heightY, int width, real scale) { MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false); MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false); MatrixPtr grad = CpuMatrix::create(heightX, 1, false, false); @@ -1758,12 +1807,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, prevGradXGpu->copyFrom(*prevGradX); prevGradYGpu->copyFrom(*prevGradY); - grad->cosSimDerivative(*output, - *prevOutX, - *prevOutY, - *prevGradX, - *prevGradY, - scale); + grad->cosSimDerivative( + *output, *prevOutX, *prevOutY, *prevGradX, *prevGradY, scale); gradGpu->cosSimDerivative(*outputGpu, *prevOutXGpu, @@ -1772,10 +1817,8 @@ void testCosSimDerivate(int heightX, int heightY, int width, *prevGradYGpu, scale); - MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, - false); - MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, - false); + MatrixPtr prevGradXCheck = CpuMatrix::create(heightX, width, false, false); + MatrixPtr prevGradYCheck = CpuMatrix::create(heightY, width, false, false); prevGradXCheck->copyFrom(*prevGradXGpu); prevGradYCheck->copyFrom(*prevGradYGpu); MatrixCheckErr(*prevGradX, *prevGradXCheck); @@ -1794,8 +1837,7 @@ TEST(Matrix, cosSimDerivate) { } } -void testParamReluForward(int height, int width, int w_height, - int w_width) { +void testParamReluForward(int height, int width, int w_height, int w_width) { MatrixPtr output = CpuMatrix::create(height, width, false, false); MatrixPtr input = CpuMatrix::create(height, width, false, false); MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); @@ -1832,8 +1874,7 @@ TEST(Matrix, paramReluForward) { } } -void testParamReluBackwardW(int height, int width, int w_height, - int w_width) { +void testParamReluBackwardW(int height, int width, int w_height, int w_width) { MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); MatrixPtr input = CpuMatrix::create(height, width, false, false); MatrixPtr w = CpuMatrix::create(w_height, w_width, false, false); @@ -1870,8 +1911,10 @@ TEST(Matrix, paramReluBackwardW) { } } -void testParamReluBackwardDiff(int height, int width, int w_height, - int w_width) { +void testParamReluBackwardDiff(int height, + int width, + int w_height, + int w_width) { MatrixPtr oGrad = CpuMatrix::create(height, width, false, false); MatrixPtr input = CpuMatrix::create(height, width, false, false); MatrixPtr diff = CpuMatrix::create(height, width, false, false); @@ -1943,11 +1986,16 @@ TEST(Matrix, classificationError) { } } -void testMaxPoolFwdBwd(int numSamples, int channels, - int imgSizeH, int imgSizeW, - int ksizeH, int ksizeW, - int strideH, int strideW, - int padH, int padW) { +void testMaxPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { int outH = 0, outW = 0; outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; @@ -1965,12 +2013,30 @@ void testMaxPoolFwdBwd(int numSamples, int channels, inputGpu->copyFrom(*input); targetGpu->copyFrom(*target); - target->maxPoolForward(*input, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); - 
targetGpu->maxPoolForward(*inputGpu, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); + target->maxPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->maxPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); targetCheck->copyFrom(*targetGpu); checkMatrixEqual(target, targetCheck); @@ -1978,35 +2044,60 @@ void testMaxPoolFwdBwd(int numSamples, int channels, MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, - false, true); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); inputGrad->randomizeUniform(); targetGrad->randomizeUniform(); inputGpuGrad->copyFrom(*inputGrad); targetGpuGrad->copyFrom(*targetGrad); - inputGrad->maxPoolBackward(*input, imgSizeH, imgSizeW, - *targetGrad, *target, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - inputGpuGrad->maxPoolBackward(*inputGpu, imgSizeH, imgSizeW, - *targetGpuGrad, *targetGpu, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth, - false, false); + inputGrad->maxPoolBackward(*input, + imgSizeH, + imgSizeW, + *targetGrad, + *target, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->maxPoolBackward(*inputGpu, + imgSizeH, + imgSizeW, + *targetGpuGrad, + *targetGpu, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); targetBwdCheck->copyFrom(*inputGpuGrad); checkMatrixEqual(inputGrad, targetBwdCheck); } -void testAvgPoolFwdBwd(int numSamples, int channels, - int imgSizeH, int imgSizeW, - int ksizeH, int ksizeW, - int strideH, int strideW, - int padH, int padW) { +void testAvgPoolFwdBwd(int numSamples, + int channels, + int imgSizeH, + int imgSizeW, + int ksizeH, + int ksizeW, + int strideH, + int strideW, + int padH, + int padW) { int outH = 0, outW = 0; outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; @@ -2024,12 +2115,30 @@ void testAvgPoolFwdBwd(int numSamples, int channels, inputGpu->copyFrom(*input); targetGpu->copyFrom(*target); - target->avgPoolForward(*input, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); - targetGpu->avgPoolForward(*inputGpu, imgSizeH, imgSizeW, - channels, ksizeW, ksizeH, - strideH, strideW, outH, outW, padH, padW); + target->avgPoolForward(*input, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); + targetGpu->avgPoolForward(*inputGpu, + imgSizeH, + imgSizeW, + channels, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + padH, + padW); MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); targetCheck->copyFrom(*targetGpu); MatrixCheckErr(*target, *targetCheck); @@ -2037,24 +2146,42 @@ void testAvgPoolFwdBwd(int numSamples, int channels, 
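Both pooling tests compute the output size with the same ceil-division formula, outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1. A quick standalone check with illustrative shapes:

```cpp
#include <cstdio>

// Ceil-division output size, as computed in testMaxPoolFwdBwd/testAvgPoolFwdBwd.
static int poolOutSize(int in, int ksize, int stride, int pad) {
  return (in - ksize + 2 * pad + stride - 1) / stride + 1;
}

int main() {
  // e.g. a 14x16 input, 5x5 window, stride 2, pad 2 (illustrative values)
  std::printf("outH=%d outW=%d\n",
              poolOutSize(14, 5, 2, 2),   // 8
              poolOutSize(16, 5, 2, 2));  // 9
  return 0;
}
```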
MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, - false, true); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); inputGrad->randomizeUniform(); targetGrad->randomizeUniform(); inputGpuGrad->copyFrom(*inputGrad); targetGpuGrad->copyFrom(*targetGrad); - inputGrad->avgPoolBackward(*targetGrad, imgSizeH, imgSizeW, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - inputGpuGrad->avgPoolBackward(*targetGpuGrad, imgSizeH, imgSizeW, - ksizeW, ksizeH, - strideH, strideW, - outH, outW, 1.0, 1.0, padH, padW); - MatrixPtr targetBwdCheck = CpuMatrix::create(numSamples, inWidth, - false, false); + inputGrad->avgPoolBackward(*targetGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + inputGpuGrad->avgPoolBackward(*targetGpuGrad, + imgSizeH, + imgSizeW, + ksizeW, + ksizeH, + strideH, + strideW, + outH, + outW, + 1.0, + 1.0, + padH, + padW); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); targetBwdCheck->copyFrom(*inputGpuGrad); MatrixCheckErr(*inputGrad, *targetBwdCheck); } @@ -2068,24 +2195,37 @@ TEST(Matrix, PoolFwdBwd) { for (auto sizeY : {2, 5}) { for (auto sH : {1, 2}) { for (auto sW : {1, 2}) { - for (auto pH : {0, (sizeY - 1)/2}) { - for (auto pW : {0, (sizeX - 1)/2}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW - << " sizeX=" << sizeX - << " sizeY=" << sizeY - << " strideH=" << sH - << " strideW=" << sW - << " padingH=" << pH - << " padingW=" << pW; - testMaxPoolFwdBwd(numSamples, channels, imgSizeH, - imgSizeW, sizeX, sizeY, sH, sW, pH, pW); - testAvgPoolFwdBwd(numSamples, channels, imgSizeH, - imgSizeW, sizeX, sizeY, sH, sW, pH, pW); - } - } + for (auto pH : {0, (sizeY - 1) / 2}) { + for (auto pW : {0, (sizeX - 1) / 2}) { + VLOG(3) << " numSamples=" << numSamples + << " channels=" << channels + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW << " sizeX=" << sizeX + << " sizeY=" << sizeY << " strideH=" << sH + << " strideW=" << sW << " padingH=" << pH + << " padingW=" << pW; + testMaxPoolFwdBwd(numSamples, + channels, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sH, + sW, + pH, + pW); + testAvgPoolFwdBwd(numSamples, + channels, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sH, + sW, + pH, + pW); + } + } } } } @@ -2096,8 +2236,8 @@ TEST(Matrix, PoolFwdBwd) { } } -void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW, - int channels, int groups) { +void testMaxOutFwdBwd( + int numSamples, int imgSizeH, int imgSizeW, int channels, int groups) { int inWidth = imgSizeH * imgSizeW * channels; int outChannels = channels / groups; int outWidth = imgSizeH * imgSizeW * outChannels; @@ -2131,10 +2271,10 @@ void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW, MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); - MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false, - true); - MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false, - false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr 
targetCheckGrad = + CpuMatrix::create(numSamples, inWidth, false, false); inputGrad->randomizeUniform(); targetGrad->randomizeUniform(); @@ -2155,10 +2295,8 @@ TEST(Matrix, MaxOutFwdBwd) { for (auto imgSizeH : {14, 28}) { for (auto imgSizeW : {16, 30}) { for (auto groups : {2, 4}) { - VLOG(3) << " numSamples=" << numSamples - << " channels=" << channels - << " imgSizeH=" << imgSizeH - << " imgSizeW=" << imgSizeW + VLOG(3) << " numSamples=" << numSamples << " channels=" << channels + << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " groups=" << groups; testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups); } @@ -2232,12 +2370,12 @@ void testMultiBinaryLabelCrossEntropy(int numSamples, int dim) { MatrixPtr cpuGrad = std::make_shared(numSamples, dim); MatrixPtr gpuGrad = std::make_shared(numSamples, dim); - MatrixPtr cpuLabel = std::make_shared - (numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false); - MatrixPtr gpuLabel = std::make_shared - (numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false); - for (int i = 0; i < numSamples; i ++) { - const unsigned int id = rand() % dim; // NOLINT + MatrixPtr cpuLabel = std::make_shared( + numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false); + MatrixPtr gpuLabel = std::make_shared( + numSamples, dim, numSamples, NO_VALUE, SPARSE_CSR, false); + for (int i = 0; i < numSamples; i++) { + const unsigned int id = rand() % dim; // NOLINT cpuLabel->setRow(i, 1, &id, nullptr); gpuLabel->setRow(i, 1, &id, nullptr); } diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h index fa682164aa8643dd088bd0ece757728e03488b76..5300e7168b9dc61b65e64346424e65c11665cf99 100644 --- a/paddle/math/tests/test_matrixUtil.h +++ b/paddle/math/tests/test_matrixUtil.h @@ -104,8 +104,7 @@ void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a, } } -void checkSMatrixErr(const CpuSparseMatrixPtr& a, - const CpuSparseMatrixPtr& b) { +void checkSMatrixErr(const CpuSparseMatrixPtr& a, const CpuSparseMatrixPtr& b) { #ifndef PADDLE_TYPE_DOUBLE real err = 1e-3; #else @@ -126,7 +125,8 @@ void checkSMatrixErr(const CpuSparseMatrixPtr& a, real bVal = b->getValue()[r]; if (std::abs(aVal - bVal) > err) { if ((std::abs(aVal - bVal) / std::abs(aVal)) > (err / 10.0f)) { - LOG(INFO) << "a=" << aVal << "\t" << "b=" << bVal; + LOG(INFO) << "a=" << aVal << "\t" + << "b=" << bVal; count++; } } diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp index 4fa9bc72013da6a3d551854516e0f0d2fe5ee1ef..837c2f47ba05a04988431e14cb6bc2490f42d32e 100644 --- a/paddle/math/tests/test_perturbation.cpp +++ b/paddle/math/tests/test_perturbation.cpp @@ -37,7 +37,9 @@ protected: virtual void TearDown() {} - void allocateMem(real*& gpuAngle, real*& gpuScale, int*& gpuCenterR, + void allocateMem(real*& gpuAngle, + real*& gpuScale, + int*& gpuCenterR, int*& gpuCenterC) { gpuAngle = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES); gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES); @@ -48,7 +50,8 @@ protected: } // Generate translation parameters for testing. 
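checkSMatrixErr above counts a pair of values as different only when it fails both the absolute tolerance err and the relative tolerance err/10. A sketch of that rule in isolation (the helper name and sample values are illustrative):

```cpp
#include <cmath>
#include <cstdio>

// Mirrors the tolerance rule in checkSMatrixErr: pass on the absolute test,
// otherwise fall back to a tighter relative test.
static bool nearlyEqual(float a, float b, float err = 1e-3f) {
  if (std::fabs(a - b) <= err) return true;               // absolute tolerance
  return std::fabs(a - b) / std::fabs(a) <= err / 10.0f;  // relative fallback
}

int main() {
  std::printf("%d\n", nearlyEqual(1000.0f, 1000.05f));  // 1: rel. diff is 5e-5
  std::printf("%d\n", nearlyEqual(0.001f, 0.003f));     // 0: fails both tests
  return 0;
}
```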
- void generateTranslationParams(int*& gpuCenterR, int*& gpuCenterC, + void generateTranslationParams(int*& gpuCenterR, + int*& gpuCenterC, int imgSize) { int cpuCenterR[NUM_IMAGES * SAMPLING_RATE]; int cpuCenterC[NUM_IMAGES * SAMPLING_RATE]; @@ -59,13 +62,13 @@ protected: gpuCenterR = (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE); - hl_memcpy_host2device(gpuCenterR, cpuCenterR, - sizeof(int) * NUM_IMAGES * SAMPLING_RATE); + hl_memcpy_host2device( + gpuCenterR, cpuCenterR, sizeof(int) * NUM_IMAGES * SAMPLING_RATE); gpuCenterC = (int*)hl_malloc_device(sizeof(int) * NUM_IMAGES * SAMPLING_RATE); - hl_memcpy_host2device(gpuCenterC, cpuCenterC, - sizeof(int) * NUM_IMAGES * SAMPLING_RATE); + hl_memcpy_host2device( + gpuCenterC, cpuCenterC, sizeof(int) * NUM_IMAGES * SAMPLING_RATE); } // Generate rotation parameters for testing. @@ -84,8 +87,7 @@ protected: cpuScale[i] = static_cast(TGT_SIZE - 2) / TGT_SIZE; } gpuScale = (real*)hl_malloc_device(sizeof(real) * NUM_IMAGES); - hl_memcpy_host2device(gpuScale, cpuScale, - sizeof(real) * NUM_IMAGES); + hl_memcpy_host2device(gpuScale, cpuScale, sizeof(real) * NUM_IMAGES); } // Generate the test images, only the center regions are set to 1. @@ -111,8 +113,7 @@ protected: } } gpuImages = (real*)hl_malloc_device(sizeof(real) * IMAGE_MEM_SIZE); - hl_memcpy_host2device(gpuImages, cpuImages, - sizeof(real) * IMAGE_MEM_SIZE); + hl_memcpy_host2device(gpuImages, cpuImages, sizeof(real) * IMAGE_MEM_SIZE); } real* gpuImages_; @@ -120,64 +121,99 @@ protected: // Random perturbation. Only to make sure the code does not break. TEST_F(PerturbationTest, random_perturb) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb(gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, - NUM_IMAGES, 1.0, 1.0, SAMPLING_RATE, gpuAngle, - gpuScaleRatio, gpuCenterR, gpuCenterC, 2, true, + hl_conv_random_disturb(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + 1.0, + 1.0, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + true, targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); } TEST_F(PerturbationTest, identity_perturb) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb(gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, - NUM_IMAGES, 1.0, 1.0, SAMPLING_RATE, gpuAngle, - gpuScaleRatio, gpuCenterR, gpuCenterC, 2, false, + hl_conv_random_disturb(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + 1.0, + 1.0, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + false, targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i 
= 0; i < TARGET_MEM_SIZE; ++i) { EXPECT_FLOAT_EQ(1.0, cpuTargets[i]); } } TEST_F(PerturbationTest, translation_test) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); - hl_generate_disturb_params(gpuAngle, gpuScaleRatio, gpuCenterR, - gpuCenterC, NUM_IMAGES, IMG_SIZE, 0.0, - 0.0, SAMPLING_RATE, false); + hl_generate_disturb_params(gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + NUM_IMAGES, + IMG_SIZE, + 0.0, + 0.0, + SAMPLING_RATE, + false); generateTranslationParams(gpuCenterR, gpuCenterC, IMG_SIZE); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb_with_params( - gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, NUM_IMAGES, SAMPLING_RATE, - gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, 2, targets); + hl_conv_random_disturb_with_params(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) { for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) { const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p; @@ -191,50 +227,80 @@ TEST_F(PerturbationTest, translation_test) { } TEST_F(PerturbationTest, rotation_test) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); - hl_generate_disturb_params(gpuAngle, gpuScaleRatio, gpuCenterR, - gpuCenterC, NUM_IMAGES, IMG_SIZE, 0.0, - 0.0, SAMPLING_RATE, false); + hl_generate_disturb_params(gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + NUM_IMAGES, + IMG_SIZE, + 0.0, + 0.0, + SAMPLING_RATE, + false); generateRotationParams(gpuAngle); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb_with_params( - gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, NUM_IMAGES, SAMPLING_RATE, - gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, 2, targets); + hl_conv_random_disturb_with_params(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < TARGET_MEM_SIZE; ++i) { EXPECT_FLOAT_EQ(1.0, cpuTargets[i]); } } TEST_F(PerturbationTest, scale_test) { - real* gpuAngle, *gpuScaleRatio; - int* gpuCenterR, *gpuCenterC; + real *gpuAngle, *gpuScaleRatio; + int *gpuCenterR, *gpuCenterC; allocateMem(gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC); - hl_generate_disturb_params(gpuAngle, gpuScaleRatio, gpuCenterR, - gpuCenterC, NUM_IMAGES, IMG_SIZE, 0.0, - 0.0, SAMPLING_RATE, false); + hl_generate_disturb_params(gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + NUM_IMAGES, + IMG_SIZE, + 0.0, + 0.0, + SAMPLING_RATE, + false); 
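The perturbation tests treat the device buffer as NUM_IMAGES * SAMPLING_RATE flat patches of TGT_SIZE * TGT_SIZE * CHANNELS values each, indexing element p of patch i at i * patchLen + p. A tiny sketch of that layout, with stand-in constants rather than the test's real sizes:

```cpp
#include <cstdio>

int main() {
  const int NUM_IMAGES = 2, SAMPLING_RATE = 2, TGT_SIZE = 3, CHANNELS = 1;
  const int patchLen = TGT_SIZE * TGT_SIZE * CHANNELS;
  for (int i = 0; i < NUM_IMAGES * SAMPLING_RATE; ++i) {
    // element p of patch i lives at i * patchLen + p
    std::printf("patch %d -> [%d, %d)\n", i, i * patchLen, (i + 1) * patchLen);
  }
  std::printf("total buffer length = %d\n", NUM_IMAGES * SAMPLING_RATE * patchLen);
  return 0;
}
```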
generateScaleParams(gpuScaleRatio); real* targets = NULL; const int TARGET_MEM_SIZE = NUM_IMAGES * SAMPLING_RATE * TGT_SIZE * TGT_SIZE * CHANNELS; targets = (real*)hl_malloc_device(sizeof(real) * TARGET_MEM_SIZE); - hl_conv_random_disturb_with_params( - gpuImages_, IMG_SIZE, TGT_SIZE, CHANNELS, NUM_IMAGES, SAMPLING_RATE, - gpuAngle, gpuScaleRatio, gpuCenterR, gpuCenterC, 2, targets); + hl_conv_random_disturb_with_params(gpuImages_, + IMG_SIZE, + TGT_SIZE, + CHANNELS, + NUM_IMAGES, + SAMPLING_RATE, + gpuAngle, + gpuScaleRatio, + gpuCenterR, + gpuCenterC, + 2, + targets); real cpuTargets[TARGET_MEM_SIZE]; - hl_memcpy_device2host(cpuTargets, targets, - sizeof(real) * TARGET_MEM_SIZE); + hl_memcpy_device2host(cpuTargets, targets, sizeof(real) * TARGET_MEM_SIZE); for (int i = 0; i < SAMPLING_RATE * NUM_IMAGES; ++i) { for (int p = 0; p < TGT_SIZE * TGT_SIZE * CHANNELS; ++p) { const int offset = i * TGT_SIZE * TGT_SIZE * CHANNELS + p; diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp index 6048dd81122292c6af4a726217d13794ee0f019c..d7aa20eb984417ff3907b078a263c5651d6209d3 100644 --- a/paddle/math/tests/test_sparseMatrixCompare.cpp +++ b/paddle/math/tests/test_sparseMatrixCompare.cpp @@ -155,7 +155,7 @@ TEST(SMatrix, sMatrixMul) { for (auto M : {1, 40, 128, 200}) { for (auto N : {100, 2000, 20480}) { for (auto K : {100, 512, 1024}) { - VLOG(3) << " M=" << M << " N=" << N << " K=" << K;; + VLOG(3) << " M=" << M << " N=" << N << " K=" << K; testSpMatrixMul(M, N, K, 0.05); } } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 42c74661d2b2cebe0c2f5f14d0970ab2f1fec866..81d53f065b84b2699141fc599b9efba794bbd25a 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -12,14 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "Argument.h" #include "paddle/math/SparseMatrix.h" #include namespace paddle { -static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, +static void resizeAndCopy(MatrixPtr& dest, + const MatrixPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { if (!dest) { @@ -34,7 +35,9 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, bool useGpu, } } -static void resizeAndCopy(IVectorPtr& dest, const IVectorPtr& src, bool useGpu, +static void resizeAndCopy(IVectorPtr& dest, + const IVectorPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { IVector::resizeOrCreate(dest, src->getSize(), useGpu); @@ -56,8 +59,11 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest, } } -static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, - int32_t startRow, int32_t copySize, bool useGpu, +static void resizeAndCopy(MatrixPtr& dest, + const MatrixPtr& src, + int32_t startRow, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startRow + copySize, src->getHeight()); @@ -84,8 +90,11 @@ static void resizeAndCopy(MatrixPtr& dest, const MatrixPtr& src, } } -static void resizeAndCopy(IVectorPtr& dest, const IVectorPtr& src, - int32_t startPos, int32_t copySize, bool useGpu, +static void resizeAndCopy(IVectorPtr& dest, + const IVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startPos + copySize, src->getSize()); @@ -115,7 +124,8 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest, } static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, bool useGpu, + const UserDefinedVectorPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { CHECK(!useGpu) << "not implemented"; @@ -132,8 +142,10 @@ static void resizeAndCopy(UserDefinedVectorPtr& dest, } static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, int32_t startPos, - int32_t copySize, bool useGpu, + const UserDefinedVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK(!useGpu) << "not implemented"; @@ -151,7 +163,9 @@ static void resizeAndCopy(UserDefinedVectorPtr& dest, } } -static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, bool useGpu, +static void resizeAndCopy(SVectorPtr& dest, + const SVectorPtr& src, + bool useGpu, hl_stream_t stream) { if (src) { size_t height = src->size(); @@ -166,8 +180,11 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, bool useGpu, } } -static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, - int32_t startPos, int32_t copySize, bool useGpu, +static void resizeAndCopy(SVectorPtr& dest, + const SVectorPtr& src, + int32_t startPos, + int32_t copySize, + bool useGpu, hl_stream_t stream = HPPL_STREAM_DEFAULT) { if (src) { CHECK_LE((size_t)startPos + copySize, src->size()); @@ -184,37 +201,46 @@ static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, } void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu) { - resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); + resizeAndCopyFrom(src, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); } -void Argument::resizeAndCopyFrom(const Argument& src, bool useGpu, +void Argument::resizeAndCopyFrom(const Argument& src, + bool useGpu, hl_stream_t stream) { dataId = src.dataId; resizeAndCopy(value, src.value, 
useGpu, stream); resizeAndCopy(grad, src.grad, useGpu, stream); resizeAndCopy(in, src.in, useGpu, stream); resizeAndCopy(ids, src.ids, useGpu, stream); - resizeAndCopy(sequenceStartPositions, src.sequenceStartPositions, - false /* useGpu */, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + false /* useGpu */, + stream); if (src.hasSubseq()) { resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, false /* useGpu */, stream); + src.subSequenceStartPositions, + false /* useGpu */, + stream); } resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream); } -int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu) { - int32_t size = resizeAndCopyFrom(src, startSeq, copySize, useGpu, - HPPL_STREAM_DEFAULT); - hl_stream_synchronize(HPPL_STREAM_DEFAULT); - return size; +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu) { + int32_t size = + resizeAndCopyFrom(src, startSeq, copySize, useGpu, HPPL_STREAM_DEFAULT); + hl_stream_synchronize(HPPL_STREAM_DEFAULT); + return size; } -int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu, +int32_t Argument::resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, hl_stream_t stream) { dataId = src.dataId; @@ -239,8 +265,12 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream); - resizeAndCopy(sequenceStartPositions, src.sequenceStartPositions, - startSeq, copySize + 1, false, stream); + resizeAndCopy(sequenceStartPositions, + src.sequenceStartPositions, + startSeq, + copySize + 1, + false, + stream); // modify new sequenceStartPositions int* destSequences = sequenceStartPositions->getMutableData(false); for (int i = 0; i < copySize + 1; i++) { @@ -264,8 +294,11 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, } int32_t copySubSize = subEndSeq - subStartSeq; resizeAndCopy(subSequenceStartPositions, - src.subSequenceStartPositions, subStartSeq, - copySubSize + 1, false, stream); + src.subSequenceStartPositions, + subStartSeq, + copySubSize + 1, + false, + stream); // modify new subSequenceStartPositions int* destSubSequences = subSequenceStartPositions->getMutableData(false); for (int i = 0; i < copySubSize + 1; i++) { @@ -281,14 +314,19 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, int32_t startSeq, void Argument::concat(const std::vector& args, const std::vector& selectRows, - const std::vector& seqStartPos, bool useGpu, - hl_stream_t stream, PassType passType) { + const std::vector& seqStartPos, + bool useGpu, + hl_stream_t stream, + PassType passType) { CHECK(!subSequenceStartPositions) - << "undefined behavior for subsequence positions"; + << "undefined behavior for subsequence positions"; size_t batchSize = selectRows.size(); - auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, - int startRow, int pos, int size, + auto copyArg = [batchSize, stream](MatrixPtr& dst, + MatrixPtr src, + int startRow, + int pos, + int size, bool useGpu) { if (!src) { dst.reset(); @@ -305,8 +343,11 @@ void Argument::concat(const std::vector& args, tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream); }; - 
auto copyIds = [batchSize, stream](IVectorPtr& dst, const IVectorPtr& src, - int startRow, int pos, int size, + auto copyIds = [batchSize, stream](IVectorPtr& dst, + const IVectorPtr& src, + int startRow, + int pos, + int size, bool useGpu) { if (!src) { dst.reset(); @@ -316,8 +357,11 @@ void Argument::concat(const std::vector& args, dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream); }; - auto copyStrs = [batchSize, stream](SVectorPtr& dst, const SVectorPtr& src, - int startRow, int pos, int size, + auto copyStrs = [batchSize, stream](SVectorPtr& dst, + const SVectorPtr& src, + int startRow, + int pos, + int size, bool useGpu) { if (!src) { dst.reset(); @@ -328,8 +372,8 @@ void Argument::concat(const std::vector& args, } else { dst->resize(batchSize); } - std::copy(src->begin() + pos, src->begin() + pos + size, - dst->begin() + startRow); + std::copy( + src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow); }; dataId = args[0].dataId; @@ -354,14 +398,16 @@ void Argument::concat(const std::vector& args, copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu); } } - ICpuGpuVector::resizeOrCreate(sequenceStartPositions, - seqStartPos.size(), useGpu); - sequenceStartPositions->copyFrom(seqStartPos.data(), - seqStartPos.size(), useGpu); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, seqStartPos.size(), useGpu); + sequenceStartPositions->copyFrom( + seqStartPos.data(), seqStartPos.size(), useGpu); } -void Argument::concat(const std::vector& args, bool useGpu, - hl_stream_t stream, PassType passType) { +void Argument::concat(const std::vector& args, + bool useGpu, + hl_stream_t stream, + PassType passType) { int32_t batchSize = 0; int64_t numSequences = 0; int64_t numSubSequences = 0; @@ -371,8 +417,8 @@ void Argument::concat(const std::vector& args, bool useGpu, numSubSequences += arg.getNumSubSequences(); } - auto copyArg = [batchSize, stream](MatrixPtr& dst, MatrixPtr src, - int startRow, bool useGpu) { + auto copyArg = [batchSize, stream]( + MatrixPtr& dst, MatrixPtr src, int startRow, bool useGpu) { if (!src) { dst.reset(); return; @@ -388,8 +434,8 @@ void Argument::concat(const std::vector& args, bool useGpu, tmpMatrix->copyFrom(*src, stream); }; - auto copyIds = [batchSize, stream](IVectorPtr& dst, const IVectorPtr& src, - int startRow, bool useGpu) { + auto copyIds = [batchSize, stream]( + IVectorPtr& dst, const IVectorPtr& src, int startRow, bool useGpu) { if (!src) { dst.reset(); return; @@ -398,8 +444,8 @@ void Argument::concat(const std::vector& args, bool useGpu, dst->subVec(startRow, src->getSize())->copyFrom(*src, stream); }; - auto copyStrs = [batchSize, stream](SVectorPtr& dst, const SVectorPtr& src, - int startRow, bool useGpu) { + auto copyStrs = [batchSize, stream]( + SVectorPtr& dst, const SVectorPtr& src, int startRow, bool useGpu) { if (!src) { dst.reset(); return; @@ -412,21 +458,23 @@ void Argument::concat(const std::vector& args, bool useGpu, std::copy(src->begin(), src->end(), dst->begin() + startRow); }; - auto copySequencePos = [] - (ICpuGpuVectorPtr& dstSeq, const ICpuGpuVectorPtr& srcSeq, - int dstNumSequences, int srcNumSequences, - int& startSequences, int startRow) { - if (srcSeq) { - ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); - const int* src = srcSeq->getData(false); - int* dest = dstSeq->getMutableData(false); - for (int i = 0; i < srcNumSequences + 1; ++i) { - dest[i + startSequences] = src[i] + startRow; - } - startSequences += srcNumSequences; - } else { - dstSeq.reset(); + auto 
copySequencePos = [](ICpuGpuVectorPtr& dstSeq, + const ICpuGpuVectorPtr& srcSeq, + int dstNumSequences, + int srcNumSequences, + int& startSequences, + int startRow) { + if (srcSeq) { + ICpuGpuVector::resizeOrCreate(dstSeq, dstNumSequences + 1, false); + const int* src = srcSeq->getData(false); + int* dest = dstSeq->getMutableData(false); + for (int i = 0; i < srcNumSequences + 1; ++i) { + dest[i + startSequences] = src[i] + startRow; } + startSequences += srcNumSequences; + } else { + dstSeq.reset(); + } }; int startRow = 0; @@ -479,8 +527,8 @@ void Argument::splitByDataId(const std::vector& argus, void Argument::getSeqInfo(std::vector* seqInfo) const { const int* starts = sequenceStartPositions->getData(false); - const int* subStarts = hasSubseq() - ? subSequenceStartPositions->getData(false) : nullptr; + const int* subStarts = + hasSubseq() ? subSequenceStartPositions->getData(false) : nullptr; size_t numSequences = getNumSequences(); seqInfo->reserve(numSequences); int subSeqEnd = 0; @@ -501,7 +549,8 @@ void Argument::getSeqInfo(std::vector* seqInfo) const { } seqInfo->push_back(info); } - std::sort(seqInfo->begin(), seqInfo->end(), + std::sort(seqInfo->begin(), + seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) { return a.topLevelLength > b.topLevelLength; }); @@ -535,9 +584,8 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { CHECK_EQ(input.hasSubseq(), 1UL); size_t numSequences = input.getNumSequences(); size_t numSubSequences = input.getNumSubSequences(); - ICpuGpuVector::resizeOrCreate(sequenceStartPositions, - numSequences + 1, - false); + ICpuGpuVector::resizeOrCreate( + sequenceStartPositions, numSequences + 1, false); int* tgtBuf = sequenceStartPositions->getMutableData(false); const int* starts = input.sequenceStartPositions->getData(false); const int* subStarts = input.subSequenceStartPositions->getData(false); @@ -551,24 +599,29 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { tgtBuf[numSequences] = numSubSequences; } -void Argument::subArgFrom(const Argument& input, size_t offset, size_t height, - size_t width, bool useGpu, bool trans, bool seqFlag, - size_t seqStart, size_t seqSize) { +void Argument::subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans, + bool seqFlag, + size_t seqStart, + size_t seqSize) { if (input.value) { - value = Matrix::create(input.value->getData() + offset * width, - height, width, trans, useGpu); + value = Matrix::create( + input.value->getData() + offset * width, height, width, trans, useGpu); } if (input.ids) { ids = IVector::create(input.ids->getData() + offset, height, useGpu); } if (input.grad) { - grad = Matrix::create(input.grad->getData() + offset * width, - height, width, trans, useGpu); + grad = Matrix::create( + input.grad->getData() + offset * width, height, width, trans, useGpu); } if (seqFlag) { sequenceStartPositions = std::make_shared( - *(input.sequenceStartPositions), - seqStart, seqSize); + *(input.sequenceStartPositions), seqStart, seqSize); } } diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 81ff9029bc4c8fca7adbabd7ae65caf7ac2f3c2a..2b20122debf935562d36f29d872e8ef3243111e0 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
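copySequencePos above merges per-argument sequenceStartPositions by shifting each source's offsets by the rows already written; the shared boundary entry is written twice with the same value, so the result stays consistent. A worked scalar example with illustrative start positions:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Two arguments: {0,3,5} has 2 sequences over 5 rows, {0,2,6} has 2 over 6.
  std::vector<std::vector<int>> srcs = {{0, 3, 5}, {0, 2, 6}};
  std::vector<int> dst(5);  // total sequences (2 + 2) + 1
  int startSequences = 0, startRow = 0;
  for (const auto& src : srcs) {
    const int numSeq = static_cast<int>(src.size()) - 1;
    for (int i = 0; i <= numSeq; ++i) dst[startSequences + i] = src[i] + startRow;
    startSequences += numSeq;
    startRow += src.back();  // rows consumed by this argument so far
  }
  for (int v : dst) std::printf("%d ", v);  // prints: 0 3 5 7 11
  std::printf("\n");
  return 0;
}
```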
*/ - #pragma once #include "hl_gpu.h" @@ -153,9 +152,8 @@ struct Argument { } int64_t getNumSubSequences() const { - return subSequenceStartPositions - ? subSequenceStartPositions->getSize() - 1 - : getBatchSize(); + return subSequenceStartPositions ? subSequenceStartPositions->getSize() - 1 + : getBatchSize(); } bool hasSubseq() const { return subSequenceStartPositions != nullptr; } @@ -190,9 +188,14 @@ struct Argument { * @param seqStart[in] offset of input.sequenceStartPositions * @param seqSize[in] lenght of output.sequenceStartPositions */ - void subArgFrom(const Argument& input, size_t offset, size_t height, - size_t width, bool useGpu, bool trans = false, - bool seqFlag = false, size_t seqStart = 0, + void subArgFrom(const Argument& input, + size_t offset, + size_t height, + size_t width, + bool useGpu, + bool trans = false, + bool seqFlag = false, + size_t seqStart = 0, size_t seqSize = 0); /* * for sequence input: @@ -206,16 +209,21 @@ struct Argument { * Note that when specifying the stream explicitly in this case, * synchronize should also be called somewhere after this function */ - int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu, hl_stream_t stream); + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu, + hl_stream_t stream); /* * same with the above function, except that the stream is * HPPL_STREAM_DEFAULT and synchronize is automatically called * inside it */ - int32_t resizeAndCopyFrom(const Argument& src, int32_t startSeq, - int32_t copySize, bool useGpu = FLAGS_use_gpu); + int32_t resizeAndCopyFrom(const Argument& src, + int32_t startSeq, + int32_t copySize, + bool useGpu = FLAGS_use_gpu); void resizeAndCopyFrom(const Argument& src, bool useGpu, hl_stream_t stream); @@ -237,13 +245,16 @@ struct Argument { */ void concat(const std::vector& args, const std::vector& selectRows, - const std::vector& seqStartPos, bool useGpu, - hl_stream_t stream, PassType passType); + const std::vector& seqStartPos, + bool useGpu, + hl_stream_t stream, + PassType passType); /* Concatenate several args into one and put the result into this. */ - void concat(const std::vector& src, bool useGpu = FLAGS_use_gpu, + void concat(const std::vector& src, + bool useGpu = FLAGS_use_gpu, hl_stream_t stream = HPPL_STREAM_DEFAULT, PassType passType = PASS_TEST); diff --git a/paddle/parameter/AverageOptimizer.cpp b/paddle/parameter/AverageOptimizer.cpp index 4f730059c748f36d690f388d29d213c676ac9626..593594761ed57495b92a30a8f3e8e86cdb45bfce 100644 --- a/paddle/parameter/AverageOptimizer.cpp +++ b/paddle/parameter/AverageOptimizer.cpp @@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "AverageOptimizer.h" namespace paddle { // factory method to create an instance of AverageOptimizer ParameterOptimizer* AverageOptimizer::create( - const OptimizationConfig& optConfig, ParameterOptimizer* optimizer, - bool isParameterSparse, bool useParameterApply) { + const OptimizationConfig& optConfig, + ParameterOptimizer* optimizer, + bool isParameterSparse, + bool useParameterApply) { if (optConfig.average_window() <= 0) { return optimizer; } @@ -44,8 +45,8 @@ AverageOptimizer::AverageOptimizer(const OptimizationConfig& optConfig, prevNumUpdates_(0), numAccumulates_(0), oldNumAccumulates_(0), - minAverageWindow_(std::min( - 10000L, optConfig_.max_average_window())), + minAverageWindow_( + std::min(10000L, optConfig_.max_average_window())), maxAverageWindow_(optConfig_.max_average_window()) { parameterTypes_ = optimizer_->getParameterTypes(); addParameterType(PARAMETER_SUM1); @@ -121,17 +122,27 @@ ParameterOptimizer::TraverseCallback AverageOptimizer::apply() { real scale = 1. / (numAccumulates_ + oldNumAccumulates_); if (useApply_) { - return [scale](const VectorPtr vecs[], const ParameterConfig& config, + return [scale](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { - vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2], - *vecs[PARAMETER_SUM3], scale, scale, scale); + vecs[PARAMETER_APPLY]->add3(*vecs[PARAMETER_SUM1], + *vecs[PARAMETER_SUM2], + *vecs[PARAMETER_SUM3], + scale, + scale, + scale); }; } else { - return [scale](const VectorPtr vecs[], const ParameterConfig& config, + return [scale](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_GRADIENT]->copyFrom(*vecs[PARAMETER_VALUE]); - vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1], *vecs[PARAMETER_SUM2], - *vecs[PARAMETER_SUM3], scale, scale, scale); + vecs[PARAMETER_VALUE]->add3(*vecs[PARAMETER_SUM1], + *vecs[PARAMETER_SUM2], + *vecs[PARAMETER_SUM3], + scale, + scale, + scale); }; } } @@ -144,8 +155,8 @@ ParameterOptimizer::TraverseCallback AverageOptimizer::restore() { return nullptr; } - return [](const VectorPtr vecs[], const ParameterConfig& config, - size_t sparseId) { + return []( + const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_VALUE]->copyFrom(*vecs[PARAMETER_GRADIENT]); vecs[PARAMETER_GRADIENT]->zeroMem(); }; @@ -174,7 +185,8 @@ ParameterOptimizer::TraverseCallback AverageSparseOptimizer::startCatchUpWith() if (timer_ > 0) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->catchUpWith(vecs, config, sparseId); }); } diff --git a/paddle/parameter/AverageOptimizer.h b/paddle/parameter/AverageOptimizer.h index 8e0ead84125ab283756acdbd3bf9120918adcf35..ccc2612608db574274f3e0acaacec7f9eb404223 100644 --- a/paddle/parameter/AverageOptimizer.h +++ b/paddle/parameter/AverageOptimizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include "FirstOrderOptimizer.h" @@ -26,7 +25,8 @@ public: // if *useParameterApply* set, use PARAMETER_APPLY to store averaged parameter // else use PARAMETER_VALUE, and value backup in PARAMETER_GRADIENT AverageOptimizer(const OptimizationConfig& optConfig, - ParameterOptimizer* optimizer, bool useParameterApply); + ParameterOptimizer* optimizer, + bool useParameterApply); static ParameterOptimizer* create(const OptimizationConfig& optConfig, ParameterOptimizer* optimizer, @@ -45,7 +45,8 @@ public: virtual void startBatch(int64_t numSamplesProcessed); virtual void finishBatch(); - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const { optimizer_->update(vecs, paraConfig, sparseId); vecs[PARAMETER_SUM1]->add(*vecs[PARAMETER_VALUE], 1.0f); @@ -99,7 +100,8 @@ protected: class AverageSparseOptimizer : public AverageOptimizer { public: AverageSparseOptimizer(const OptimizationConfig& optConfig, - ParameterOptimizer* optimizer, bool useParameterApply) + ParameterOptimizer* optimizer, + bool useParameterApply) : AverageOptimizer(optConfig, optimizer, useParameterApply) {} virtual void init(size_t numRows, const ParameterConfig* config) { @@ -114,9 +116,11 @@ public: AverageOptimizer::finishBatch(); timer_++; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const; - void catchUpWith(const VectorPtr vecs[], const ParameterConfig& paraConfig, + void catchUpWith(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const; virtual TraverseCallback startCatchUpWith() const; virtual void finishCatchUpWith() { diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp index bb46a51d1e02c6d7e96e33c2cac0585055f026a1..a9be07d062992ff24175339c630426d27e84c22b 100644 --- a/paddle/parameter/FirstOrderOptimizer.cpp +++ b/paddle/parameter/FirstOrderOptimizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" @@ -71,13 +70,15 @@ void SparseMomentumParameterOptimizer::update(const VectorPtr vecs[], tau_ * alpha_ * gamma_ * learningRate_); vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT], tau_ / beta_ + 1.0 / alpha_, - *vecs[PARAMETER_MOMENTUM_VT], 1.0 / beta_); + *vecs[PARAMETER_MOMENTUM_VT], + 1.0 / beta_); } else { - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - learningRate_ * paraConfig.learning_rate(), paraConfig.momentum(), - applyDecay_ ? paraConfig.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + learningRate_ * paraConfig.learning_rate(), + paraConfig.momentum(), + applyDecay_ ? paraConfig.decay_rate() : 0); } } @@ -90,7 +91,8 @@ SparseMomentumParameterOptimizer::needSpecialTraversal( // 2. 
Note that \tau * u_t + v_t = \beta \theta_t, therefore: // u_t should be rescaled to u_t/alpha_ // v_t should be reset to \theta_t - return [this](const VectorPtr vecs[], const ParameterConfig& config, + return [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_MOMENTUM_UT]->divScalar(alpha_); vecs[PARAMETER_MOMENTUM_VT]->assign(*vecs[PARAMETER_VALUE]); @@ -120,10 +122,12 @@ void AdagradParameterOptimizer::update(const VectorPtr vecs[], vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } ParameterOptimizer::TraverseCallback @@ -132,7 +136,8 @@ AdagradParameterOptimizer::needSpecialTraversal( if (numUpdates_ % kMaxNumAccumulates == 0) { // Move the sum to a different buffer to avoid loss of precision // due to too many sums. - return [this](const VectorPtr vecs[], const ParameterConfig& config, + return [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { vecs[PARAMETER_GRADIENT_SQURESUM]->add( *vecs[PARAMETER_GRADIENT_SQURESUM1]); @@ -148,24 +153,29 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], size_t sparseId) const { CHECK(sparseId == -1LU) << "Sparse update is not supported"; // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 - vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(*vecs[PARAMETER_GRADIENT], - rou_, 1.0f - rou_); + vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( + *vecs[PARAMETER_GRADIENT], rou_, 1.0f - rou_); // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], *vecs[PARAMETER_GRADIENT_SQURESUM], - epsilon_, epsilon_); + epsilon_, + epsilon_); vecs[PARAMETER_LEARNING_RATE]->sqrt(); // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_LEARNING_RATE], rou_, + *vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_LEARNING_RATE], + rou_, 1.0f - rou_); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } void RMSPropParameterOptimizer::update(const VectorPtr vecs[], @@ -185,12 +195,13 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[], // For the first time update, make the sum be the current square // so that the initial estimation of E(g_t^2) will not be too small. vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, + *vecs[PARAMETER_GRADIENT], + accumulatedRou, firstTime ? 
1.0f : 1.0f - rou_); // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g - vecs[PARAMETER_GRADIENT_SQURESUM1]->add(*vecs[PARAMETER_GRADIENT], - accumulatedRou, 1.0f - rou_); + vecs[PARAMETER_GRADIENT_SQURESUM1]->add( + *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou_); // learn_rate = 1 / sqrt( E(g_t^2) - (E(g_t))^2 + epsilon ) // Basically, if the sign of the gradient changes more often, @@ -201,10 +212,12 @@ vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], @@ -224,7 +237,8 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], // For the first time update, make the sum be the current square // so that the initial estimation of E(g_t^2) will not be too small. vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( - *vecs[PARAMETER_GRADIENT], accumulatedRou, + *vecs[PARAMETER_GRADIENT], + accumulatedRou, firstTime ? 1.0f : 1.0f - rou_); // learn_rate = 1 / sqrt( E(g_t^2) + epsilon ) @@ -234,10 +248,12 @@ vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); - vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], - *vecs[PARAMETER_LEARNING_RATE], learningRate_ * config.learning_rate(), - config.momentum(), applyDecay_ ? config.decay_rate() : 0); + vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_LEARNING_RATE], + learningRate_ * config.learning_rate(), + config.momentum(), + applyDecay_ ? config.decay_rate() : 0); } void AdamParameterOptimizer::update(const VectorPtr vecs[], @@ -290,7 +306,6 @@ void AdamaxParameterOptimizer::update(const VectorPtr vecs[], theta->add(*theta, 1.0, *g, -learningRate); } - void OptimizerWithGradientClipping::update(const VectorPtr vecs[], const ParameterConfig& config, size_t sparseId) const { diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h index ad5f48097643a10d8b6f5bf3202211aa2b092469..a9a2ffdd41310d1927df012be8328d0e4bd3af0f 100644 --- a/paddle/parameter/FirstOrderOptimizer.h +++ b/paddle/parameter/FirstOrderOptimizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "ParameterOptimizer.h" @@ -31,21 +30,22 @@ public: virtual void startBatch(int64_t numSamplesProcessed) { learningRate_ = calcLearningRate(numSamplesProcessed, pass_); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const { (void)sparseId; - real torch_learningRate = optConfig_.learning_method() == "torch_momentum" ?
- 1.0 - paraConfig.momentum() : 1.0; + real torch_learningRate = optConfig_.learning_method() == "torch_momentum" + ? 1.0 - paraConfig.momentum() + : 1.0; vecs[PARAMETER_VALUE]->sgdUpdate( - *vecs[PARAMETER_GRADIENT], *vecs[PARAMETER_MOMENTUM], + *vecs[PARAMETER_GRADIENT], + *vecs[PARAMETER_MOMENTUM], learningRate_ * paraConfig.learning_rate() * - (firstTime_ ? 1.0 : torch_learningRate), + (firstTime_ ? 1.0 : torch_learningRate), paraConfig.momentum(), applyDecay_ ? paraConfig.decay_rate() : 0); } - virtual void finishBatch() { - firstTime_ = false; - } + virtual void finishBatch() { firstTime_ = false; } }; // SGD optimization with sparse support. @@ -71,7 +71,8 @@ public: const OptimizationConfig& optConfig); virtual void init(size_t numRows, const ParameterConfig* config); virtual void startBatch(int64_t numSamplesProcessed); - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const; virtual TraverseCallback needSpecialTraversal( const ParameterConfig& config) const; @@ -111,7 +112,8 @@ public: (void)numSamplesProcessed; ++numUpdates_; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual TraverseCallback needSpecialTraversal( const ParameterConfig& config) const; @@ -141,7 +143,8 @@ public: learningRate_ = calcLearningRate(numSamplesProcessed, pass_); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -173,7 +176,8 @@ public: } virtual void finishBatch() { timer_++; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -214,7 +218,8 @@ public: } virtual void finishBatch() { timer_++; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -251,7 +256,8 @@ public: virtual void finishBatch() { ++step_; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -280,7 +286,8 @@ public: virtual void finishBatch() { ++step_; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; protected: @@ -301,7 +308,8 @@ public: // learningRate required by regularizer learningRate_ = calcLearningRate(numSamplesProcessed, pass_); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const { vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT], optConfig_.delta_add_rate()); @@ -314,7 +322,8 @@ public: explicit DummyOptimizer(const OptimizationConfig& optConfig) : ParameterOptimizer(optConfig) {} - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, size_t sparseId) const {} }; @@ -344,7 +353,8 @@ public: const ParameterConfig& 
config) const { return optimizer_->needSpecialTraversal(config); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual void setNoDecay() { optimizer_->setNoDecay(); } diff --git a/paddle/parameter/LearningRateScheduler.cpp b/paddle/parameter/LearningRateScheduler.cpp index ce045ebf05a226215d565bf0281f245918e13055..a7412500ccfa05707286f0ad493ad8280eee1cbc 100644 --- a/paddle/parameter/LearningRateScheduler.cpp +++ b/paddle/parameter/LearningRateScheduler.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "LearningRateScheduler.h" #include "paddle/utils/StringUtil.h" diff --git a/paddle/parameter/LearningRateScheduler.h b/paddle/parameter/LearningRateScheduler.h index 74fb848fabe1ad9bbea8620d51d9d3674eb8a526..e987c3dcde120b8c88d58de7a18ee5c6db85bb5c 100644 --- a/paddle/parameter/LearningRateScheduler.h +++ b/paddle/parameter/LearningRateScheduler.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "TrainerConfig.pb.h" @@ -20,9 +19,10 @@ limitations under the License. */ namespace paddle { // NOLINTNEXTLINES_4 -#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \ - static InitFunction __reg_type_##__type_name([]() { \ - LearningRateScheduler::registrar_.registerClass<__class_name>(#__type_name); \ +#define REGISTER_LEARNING_RATE_SCHEDULER(__type_name, __class_name) \ + static InitFunction __reg_type_##__type_name([]() { \ + LearningRateScheduler::registrar_.registerClass<__class_name>( \ + #__type_name); \ }) class LearningRateScheduler { diff --git a/paddle/parameter/OptimizerFunctions.cpp b/paddle/parameter/OptimizerFunctions.cpp index 5adcf86efd5284ab5bc3131217c9e44172caa71b..6fd7964347644214533007dc1e11e6fa45ee9ea6 100644 --- a/paddle/parameter/OptimizerFunctions.cpp +++ b/paddle/parameter/OptimizerFunctions.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
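For readers new to this registry pattern: the REGISTER_LEARNING_RATE_SCHEDULER macro reflowed above creates a file-scope InitFunction whose lambda runs during static initialization. As a hedged illustration (PolyScheduler is a hypothetical class name, not one confirmed by this patch), REGISTER_LEARNING_RATE_SCHEDULER(poly, PolyScheduler) expands to roughly:

// Expansion sketch; the InitFunction runs the lambda at startup so the
// registrar can later construct the scheduler from its config type name.
static InitFunction __reg_type_poly([]() {
  LearningRateScheduler::registrar_.registerClass<PolyScheduler>("poly");
});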
*/ - #include "AverageOptimizer.h" #include "FirstOrderOptimizer.h" #include "OptimizerWithRegularizer.h" @@ -22,19 +21,22 @@ namespace paddle { // creator for AverageOptimizer ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver) { + bool isParameterSparse, + bool inPserver) { ParameterOptimizer* optimizer = OptimizerWithRegularizer::create( optConfig, paraConfig, isParameterSparse, inPserver); - return AverageOptimizer::create(optConfig, optimizer, isParameterSparse, - inPserver /*useParameterApply*/); + return AverageOptimizer::create( + optConfig, optimizer, isParameterSparse, inPserver /*useParameterApply*/); } std::vector sgdOptimizerGetTypes( const OptimizationConfig& optConfig, bool inPserver) { std::unique_ptr optimizer; - optimizer.reset(AverageOptimizer::create( - optConfig, ParameterOptimizer::create(optConfig, inPserver), - false /*isParameterSparse*/, inPserver)); + optimizer.reset( + AverageOptimizer::create(optConfig, + ParameterOptimizer::create(optConfig, inPserver), + false /*isParameterSparse*/, + inPserver)); CHECK(optimizer) << "fail to create optimizer: " << optConfig.learning_method(); return optimizer->getParameterTypes(); diff --git a/paddle/parameter/OptimizerFunctions.h b/paddle/parameter/OptimizerFunctions.h index 9592658224d856fff1a2bde5e400ea85f95cd521..a5f8b2c56942720335c0df6c9d71fd4e15494600 100644 --- a/paddle/parameter/OptimizerFunctions.h +++ b/paddle/parameter/OptimizerFunctions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "FirstOrderOptimizer.h" @@ -25,7 +24,8 @@ namespace paddle { */ ParameterOptimizer* sgdOptimizerCreate(const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver); + bool isParameterSparse, + bool inPserver); /* * Get the parameter types needed for the specific optimization diff --git a/paddle/parameter/OptimizerWithRegularizer.cpp b/paddle/parameter/OptimizerWithRegularizer.cpp index 0da27a51c6d29337864222d2e85126113f7f6431..5381e7bef3b177884d85671ef6e3dfbc0de1d5ed 100644 --- a/paddle/parameter/OptimizerWithRegularizer.cpp +++ b/paddle/parameter/OptimizerWithRegularizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "OptimizerWithRegularizer.h" namespace paddle { @@ -24,7 +23,8 @@ OptimizerWithRegularizerEveryNumBatches::needSpecialTraversal( if (isRegularizationBatch(config)) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->doTraversal(vecs, config); }); } @@ -39,8 +39,8 @@ void OptimizerWithRegularizerEveryNumBatches::doTraversal( const VectorPtr vecs[], const ParameterConfig& config) const { int32_t base = std::max(baseTimer_, (timer_ + 1 - config.num_batches_regularization())); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), base, - timer_ + 1); + regularizer_->update( + vecs, config, optimizer_->getLearningRate(), base, timer_ + 1); } ParameterOptimizer::TraverseCallback @@ -53,7 +53,8 @@ OptimizerWithRegularizerEveryNumBatches::startCatchUpWith() const { if (baseTimer_ < timer_) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->catchUpWith(vecs, config, sparseId); }); } @@ -61,11 +62,15 @@ OptimizerWithRegularizerEveryNumBatches::startCatchUpWith() const { } void OptimizerWithRegularizerEveryNumBatches::catchUpWith( - const VectorPtr vecs[], const ParameterConfig& config, + const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const { int32_t base = timer_ - timer_ % config.num_batches_regularization(); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), - std::max(base, baseTimer_), timer_); + regularizer_->update(vecs, + config, + optimizer_->getLearningRate(), + std::max(base, baseTimer_), + timer_); } void OptimizerWithRegularizerSparse::init(size_t numRows, @@ -83,8 +88,11 @@ void OptimizerWithRegularizerSparse::update(const VectorPtr vecs[], optimizer_->update(vecs, config, sparseId); // para W(t0) -> W(t+1) CHECK_LT(sparseId, t0Vec_.size()); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), - t0Vec_[sparseId], timer_ + 1); + regularizer_->update(vecs, + config, + optimizer_->getLearningRate(), + t0Vec_[sparseId], + timer_ + 1); t0Vec_[sparseId] = timer_ + 1; } @@ -98,7 +106,8 @@ OptimizerWithRegularizerSparse::startCatchUpWith() const { if (timer_ > 0) { callbacks.emplace_back( - [this](const VectorPtr vecs[], const ParameterConfig& config, + [this](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { this->catchUpWith(vecs, config, sparseId); }); } @@ -110,18 +119,20 @@ void OptimizerWithRegularizerSparse::catchUpWith(const VectorPtr vecs[], size_t sparseId) const { // para W(t0) -> W(t+1) CHECK_LT(sparseId, t0Vec_.size()); - regularizer_->update(vecs, config, optimizer_->getLearningRate(), - t0Vec_[sparseId], timer_); + regularizer_->update( + vecs, config, optimizer_->getLearningRate(), t0Vec_[sparseId], timer_); } // factory method to create instance of OptimizerWithRegularizer ParameterOptimizer* OptimizerWithRegularizer::create( - const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver) { + const OptimizationConfig& optConfig, + const ParameterConfig& paraConfig, + bool isParameterSparse, + bool inPserver) { ParameterOptimizer* optimizer = ParameterOptimizer::create(optConfig, inPserver); if (paraConfig.gradient_clipping_threshold() > 0.0f && - !dynamic_cast(optimizer)) { + !dynamic_cast(optimizer)) { optimizer = new OptimizerWithGradientClipping(optConfig, 
optimizer); } Regularizer* regularizer = @@ -157,23 +168,23 @@ ParameterOptimizer* OptimizerWithRegularizer::create( } // normal optimizer->setNoDecay(); - return new OptimizerWithRegularizerEveryNumBatches(optConfig, optimizer, - regularizer); + return new OptimizerWithRegularizerEveryNumBatches( + optConfig, optimizer, regularizer); } if (isParameterSparse) { - CHECK(paraConfig.momentum() == 0.0f) - << "Parameter cannot support momentum if it's sparse."; + CHECK(paraConfig.momentum() == 0.0f) + << "Parameter cannot support momentum if it's sparse."; optimizer->setNoDecay(); - return new OptimizerWithRegularizerSparse(optConfig, optimizer, - regularizer); + return new OptimizerWithRegularizerSparse( + optConfig, optimizer, regularizer); } // dense if (paraConfig.decay_rate_l1() == 0.0f || - dynamic_cast(optimizer)) { + dynamic_cast(optimizer)) { return optimizer; } CHECK(paraConfig.momentum() == 0.0f) - << "Parameter cannot support momentum if it uses L1 decay."; + << "Parameter cannot support momentum if it uses L1 decay."; optimizer->setNoDecay(); return new OptimizerWithRegularizer(optConfig, optimizer, regularizer); } diff --git a/paddle/parameter/OptimizerWithRegularizer.h b/paddle/parameter/OptimizerWithRegularizer.h index b8b2d5b84d6875c1f9e4ea8a9cd1c93c1fff4be5..ebe23c7397f6d3f14976422342953e493a6fbee1 100644 --- a/paddle/parameter/OptimizerWithRegularizer.h +++ b/paddle/parameter/OptimizerWithRegularizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "FirstOrderOptimizer.h" @@ -24,7 +23,8 @@ class OptimizerWithRegularizer : public ParameterOptimizer { public: static ParameterOptimizer* create(const OptimizationConfig& optConfig, const ParameterConfig& paraConfig, - bool isParameterSparse, bool inPserver); + bool isParameterSparse, + bool inPserver); OptimizerWithRegularizer(const OptimizationConfig& optConfig, ParameterOptimizer* optimizer, @@ -60,7 +60,8 @@ public: return optimizer_->needSpecialTraversal(config); } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const { optimizer_->update(vecs, config, sparseId); regularizer_->update(vecs, config, optimizer_->getLearningRate(), 0, 1); @@ -94,7 +95,8 @@ public: baseTimer_ = 0; } - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const { optimizer_->update(vecs, config, sparseId); } @@ -103,7 +105,8 @@ public: const ParameterConfig& config) const; void doTraversal(const VectorPtr vecs[], const ParameterConfig& config) const; - void catchUpWith(const VectorPtr vecs[], const ParameterConfig& config, + void catchUpWith(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual TraverseCallback startCatchUpWith() const; @@ -130,9 +133,11 @@ public: virtual void init(size_t numRows, const ParameterConfig* config); - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; - void catchUpWith(const VectorPtr vecs[], const ParameterConfig& config, + void catchUpWith(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) const; virtual TraverseCallback
startCatchUpWith() const; virtual void finishCatchUpWith() { diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp index 19cbdab1c8d1e8b4836c8f193901edb5b166f055..99b20a59ca2a8b4a84a5bcbd0fab135ac54de61c 100644 --- a/paddle/parameter/ParallelParameter.cpp +++ b/paddle/parameter/ParallelParameter.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include @@ -152,7 +151,8 @@ void SyncParameter::minorUpdate(real learnRate) { gradSem_->post(); } -AsyncParameter::AsyncParameter(TrainerRole role, int asyncCount, +AsyncParameter::AsyncParameter(TrainerRole role, + int asyncCount, ParameterPtr localParam) : ParallelParameter(role, localParam) { asyncCount_ = asyncCount; diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h index 882033af636529cd845bfaae2253767a37e2cb72..2b65321fe201ae166dbbd6629e9a0ab0c6481699 100644 --- a/paddle/parameter/ParallelParameter.h +++ b/paddle/parameter/ParallelParameter.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -47,17 +46,17 @@ const int UPDATE_TYPE_NUM = 32; * TrainerRole denotes the role of the current trainer; different roles have * different jobs. * - * control, major, minor are three kinds of roles to support multiple-GPU + * control, major, minor are three kinds of roles to support multiple-GPU * parallel SGD training. SM on GPU card has two groups; each group * consists of a major and a minor. * * @param single single GPU card, single thread training. - * + * * * @param control current parameter updates go via the control role, * which does not participate in real training. The control role is - * responsible for merging all majors' gradients and - * updating the parameter value. + * responsible for merging all majors' gradients and + * updating the parameter value. * * @param major the major role participates in real training; when the local * gradient is ready, it merges its corresponding minor's @@ -83,7 +82,8 @@ typedef void (ParallelParameter::*UpdateFunction)(real learnRate); class ParallelParameter { public: - static ParallelParameterPtr create(TrainerRole role, ParameterPtr localParam, + static ParallelParameterPtr create(TrainerRole role, + ParameterPtr localParam, int asyncCount = 1); ParallelParameter(TrainerRole role, ParameterPtr localParam) { @@ -135,7 +135,7 @@ protected: }; /** - * this class is designed for multi-threaded training. + * this class is designed for multi-threaded training. * * "Synchronous" means multiple GPUs calculate 1/4 mini-batch, * but will get only one gradient @@ -209,14 +209,14 @@ public: * During asynchronous training, the update strategy involves a slave and a master. * * slave: if within asyncCount, adopt the self-update method. - * If beyond asyncCount, wait for the master to update. + * If beyond asyncCount, wait for the master to update. */ void slaveUpdate(real learnRate); /** * During asynchronous training, the update strategy involves a slave and a master. * - * master: it only polls slaves and does not train on data. + * master: it only polls slaves and does not train on data. * If a slave's gradient is ready, fetch it. * Update the master's parameter, then copy it into * the corresponding slave.
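To make the slave-side rule above concrete, here is a hedged, self-contained toy (not the real AsyncParameter, which synchronizes with semaphores): a slave self-updates while accumulating gradients, and after asyncCount batches it would hand the accumulated gradient to the master.

#include <algorithm>
#include <vector>

// Toy model of the slave update rule documented above; illustrative only.
struct ToyAsyncParam {
  int asyncCount;                // batches a slave may self-update
  int accumulated = 0;           // batches accumulated so far
  std::vector<float> value;      // parameter values
  std::vector<float> gradAccum;  // running gradient sum for the master

  void slaveUpdate(const std::vector<float>& grad, float learnRate) {
    for (size_t i = 0; i < value.size(); ++i) {
      gradAccum[i] += grad[i];
      value[i] -= learnRate * grad[i];  // self-update inside the window
    }
    if (++accumulated >= asyncCount) {
      // Past asyncCount the real slave would block here until the master
      // consumes gradAccum and copies fresh parameters back.
      accumulated = 0;
      std::fill(gradAccum.begin(), gradAccum.end(), 0.0f);
    }
  }
};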
@@ -227,7 +227,7 @@ public: private: /** * When asynchronous training, every async trainer needs to - * accumulate a number of batch gradients. + * accumulate a number of batch gradients. * * gradientAccum_ is used to save the sum of gradients. */ diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp index 64d72ae7404f09903aea35cefd97e810b20c39a3..7e37bf225ba25e8bae269cf45b69ce418a54d1a3 100644 --- a/paddle/parameter/Parameter.cpp +++ b/paddle/parameter/Parameter.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/math/MathUtils.h" #include "AverageOptimizer.h" @@ -27,11 +26,13 @@ limitations under the License. */ #include "hl_gpu.h" #include "paddle/utils/CommandLineParser.h" -P_DEFINE_int32(enable_grad_share, (100 * 1024 * 1024), +P_DEFINE_int32(enable_grad_share, + (100 * 1024 * 1024), "threshold for enable gradient parameter share for batch " "multi-cpu training"); P_DEFINE_int32( - grad_share_block_num, 64, + grad_share_block_num, + 64, "block number of gradient parameter share for batch multi-cpu training"); namespace paddle { @@ -95,13 +96,12 @@ void Parameter::randomize(const VectorPtr& value, real initial_max = config.initial_mean() + config.initial_std(); value->uniform(initial_min, initial_max); VLOG(1) << config.name() << ": initial_min=" << initial_min - << ", initial_max=" << initial_max; + << ", initial_max=" << initial_max; } else if (PARAMETER_INIT_NORMAL == config.initial_strategy()) { /* Initialize the parameters randomly */ value->randnorm(config.initial_mean(), config.initial_std()); - VLOG(1) << config.name() - << ": initial_mean=" << config.initial_mean() - << ", initial_std=" << config.initial_std(); + VLOG(1) << config.name() << ": initial_mean=" << config.initial_mean() + << ", initial_std=" << config.initial_std(); } else { LOG(FATAL) << "not supported initial_strategy: " << config.initial_strategy(); @@ -116,12 +116,18 @@ void Parameter::randomize() { if (config_.is_sparse()) { if (format_ == SPARSE_CSC) { sparseRand(intBufs_[PARAMETER_COLS]->getData(), - intBufs_[PARAMETER_ROWS]->getData(), config_.size(), - config_.dims(1) + 1, config_.dims(0), useGpu_); + intBufs_[PARAMETER_ROWS]->getData(), + config_.size(), + config_.dims(1) + 1, + config_.dims(0), + useGpu_); } else { sparseRand(intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), config_.size(), - config_.dims(0) + 1, config_.dims(1), useGpu_); + intBufs_[PARAMETER_COLS]->getData(), + config_.size(), + config_.dims(0) + 1, + config_.dims(1), + useGpu_); } } setValueUpdated(); @@ -152,7 +158,7 @@ bool Parameter::isValueShared() { bool Parameter::isGradSparseUpdate() const { return !useGpu_ && !isStatic() && - (config_.sparse_update() || config_.sparse_remote_update()); + (config_.sparse_update() || config_.sparse_remote_update()); } void Parameter::setMat(ParameterType pType, int matType) { @@ -180,30 +186,42 @@ void Parameter::setMat(ParameterType pType, int matType) { CHECK_EQ(width + 1, intBufs_[PARAMETER_COLS]->getSize()); CHECK_EQ(size, intBufs_[PARAMETER_ROWS]->getSize()); } - mats_[pType] = Matrix::createSparseMatrix( - bufs_[pType]->getData(), intBufs_[PARAMETER_ROWS]->getData(), - intBufs_[PARAMETER_COLS]->getData(), height, width, - bufs_[pType]->getSize(), FLOAT_VALUE, format_, false, useGpu_); + mats_[pType] = + Matrix::createSparseMatrix(bufs_[pType]->getData(),
intBufs_[PARAMETER_ROWS]->getData(), + intBufs_[PARAMETER_COLS]->getData(), + height, + width, + bufs_[pType]->getSize(), + FLOAT_VALUE, + format_, + false, + useGpu_); } } else if (matType == MAT_NORMAL_SHARED) { CHECK_EQ(height * width, bufs_[pType]->getSize()); size_t blockNum = 0; CHECK(isGradShared(&blockNum)); mats_[pType] = std::make_shared( - blockNum, std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), - height, width); + blockNum, + std::dynamic_pointer_cast( + bufs_[pType]->getMemoryHandle()), + height, + width); } else if (matType == MAT_VALUE_SHARED) { CHECK_EQ(height * width, bufs_[pType]->getSize()); mats_[pType] = std::make_shared( std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()), height, width); + bufs_[pType]->getMemoryHandle()), + height, + width); } else if (matType == MAT_SPARSE_ROW_IDS) { CHECK_EQ(height * width, bufs_[pType]->getSize()); mats_[pType] = std::make_shared( std::dynamic_pointer_cast( bufs_[pType]->getMemoryHandle()), - height, width); + height, + width); } else if (matType == MAT_SPARSE_ROW) { auto valueMat = std::dynamic_pointer_cast(mats_[PARAMETER_VALUE]); @@ -214,29 +232,31 @@ void Parameter::setMat(ParameterType pType, int matType) { << " MAT_SPARSE_ROW_PREFETCH or MAT_CACHE_ROW"; indexDict = valueMat->getIndexDictHandle(); } - auto mat = std::make_shared( - nullptr, height, width, - // grad share index with value - indexDict); + auto mat = + std::make_shared(nullptr, + height, + width, + // grad share index with value + indexDict); mats_[pType] = mat; } else if (matType == MAT_CACHE_ROW) { CHECK(isGradSparseUpdate()); - auto mat = std::make_shared( - height, width); + auto mat = std::make_shared(height, width); mats_[pType] = mat; } else if (matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || matType == MAT_SPARSE_ROW_PREFETCH) { auto mat = std::make_shared( bufs_[pType] ? 
std::dynamic_pointer_cast( - bufs_[pType]->getMemoryHandle()) : nullptr, - height, width, + bufs_[pType]->getMemoryHandle()) + : nullptr, + height, + width, nullptr, // indexDictHandle getGlobalSyncThreadPool()); mats_[pType] = mat; } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { CHECK(isGradSparseUpdate()); - mats_[pType] = std::make_shared( - height, width); + mats_[pType] = std::make_shared(height, width); } else { LOG(FATAL) << "Unsupported mat type" << matType; } @@ -252,30 +272,43 @@ SparsePrefetchRowCpuMatrix* Parameter::getPrefetchMatrix() { } void Parameter::updateWithGradient(real learningRate) { - sgdUpdate(learningRate * config_.learning_rate(), config_.momentum(), - config_.decay_rate(), bufs_[PARAMETER_VALUE].get(), - bufs_[PARAMETER_GRADIENT].get(), bufs_[PARAMETER_MOMENTUM].get()); + sgdUpdate(learningRate * config_.learning_rate(), + config_.momentum(), + config_.decay_rate(), + bufs_[PARAMETER_VALUE].get(), + bufs_[PARAMETER_GRADIENT].get(), + bufs_[PARAMETER_MOMENTUM].get()); } -void Parameter::updateWithGradient(real learningRate, MatrixPtr gradMat, - IVectorPtr t0, int currentTime, bool fini) { +void Parameter::updateWithGradient(real learningRate, + MatrixPtr gradMat, + IVectorPtr t0, + int currentTime, + bool fini) { SparseRowCpuMatrix* sparseMat = dynamic_cast(gradMat.get()); CHECK(sparseMat); CHECK_EQ(config_.momentum(), 0.0f) << "not support momentum in sparse input sgd"; bool useL1 = (config_.decay_rate_l1() != 0.0f); - sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE], *t0, - learningRate * config_.learning_rate(), currentTime, + sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE], + *t0, + learningRate * config_.learning_rate(), + currentTime, useL1 ? config_.decay_rate_l1() : config_.decay_rate(), - useL1, fini); + useL1, + fini); } -void Parameter::updateWithGradient(real learningRate, VectorPtr gradVec, +void Parameter::updateWithGradient(real learningRate, + VectorPtr gradVec, bool normalUpdate) { if (normalUpdate) { - sgdUpdate(learningRate * config_.learning_rate(), config_.momentum(), - config_.decay_rate(), bufs_[PARAMETER_VALUE].get(), gradVec.get(), + sgdUpdate(learningRate * config_.learning_rate(), + config_.momentum(), + config_.decay_rate(), + bufs_[PARAMETER_VALUE].get(), + gradVec.get(), bufs_[PARAMETER_MOMENTUM].get()); } else { size_t size = gradVec->getSize(); @@ -361,7 +394,7 @@ bool Parameter::load(const std::string& filename) { return true; } LOG(FATAL) << "unsupported load_missing_parameter_strategy: " - << FLAGS_load_missing_parameter_strategy; + << FLAGS_load_missing_parameter_strategy; return false; } return load(fs); @@ -372,8 +405,8 @@ bool Parameter::load(std::istream& s) { Header header; CHECK(s.read(reinterpret_cast(&header), sizeof(header))) << "Fail to read parameter " << getName(); - CHECK_EQ(header.version, kFormatVersion) - << "Incorrect format version: " << header.version; + CHECK_EQ(header.version, kFormatVersion) << "Incorrect format version: " + << header.version; CHECK_EQ(header.size, getSize()) << "The size (" << header.size << ") in the file does not match the size " << "(" << getSize() << ") of the parameter: " << getName(); @@ -382,7 +415,7 @@ bool Parameter::load(std::istream& s) { CHECK(s.read(reinterpret_cast(vec.getData()), header.size * sizeof(real))); - auto & tmp = *bufs_[PARAMETER_VALUE].get(); + auto& tmp = *bufs_[PARAMETER_VALUE].get(); if (typeid(tmp) == typeid(GpuVector)) { bufs_[PARAMETER_VALUE]->copyFrom(vec); } @@ -393,7 +426,11 @@ bool Parameter::load(std::istream& s) { auto height = config_.dims(0); auto width 
= config_.dims(1); auto mat = Matrix::create(vec.getData(), height, width); - CpuSparseMatrix sparseMat(height, width, 0, FLOAT_VALUE, format_, + CpuSparseMatrix sparseMat(height, + width, + 0, + FLOAT_VALUE, + format_, /*trans*/ false); sparseMat.copyFrom(*mat, HPPL_STREAM_DEFAULT); auto nnz = sparseMat.getElementCnt(); @@ -423,11 +460,11 @@ bool Parameter::load(std::istream& s) { s.read(reinterpret_cast(rows.getData()), rowSize * sizeof(int))); CHECK( s.read(reinterpret_cast(cols.getData()), colSize * sizeof(int))); - auto & paramRows = *intBufs_[PARAMETER_ROWS].get(); + auto& paramRows = *intBufs_[PARAMETER_ROWS].get(); if (typeid(paramRows) == typeid(GpuIVector)) { intBufs_[PARAMETER_ROWS]->copyFrom(rows); } - auto & paramCols = *intBufs_[PARAMETER_COLS].get(); + auto& paramCols = *intBufs_[PARAMETER_COLS].get(); if (typeid(paramCols) == typeid(GpuIVector)) { intBufs_[PARAMETER_COLS]->copyFrom(cols); } @@ -457,8 +494,8 @@ void Parameter::exec(ExecFunc func) { func(this->getBufs()); } else { // multi thread VectorPtr* vecs = Parameter::getTlsTempBufs(); - auto interval = calcSplitArrayInterval(this->getSize(), (size_t)tid, - numThreads, 8LU /*for avx*/); + auto interval = calcSplitArrayInterval( + this->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/); for (size_t i = 0; i < (size_t)NUM_PARAMETER_TYPES; ++i) { if (bufs_[i]) { vecs[i]->subVecFrom(*bufs_[i], interval); diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index ff251fe89f9f885c361b6c1ae7dde0ae57695e47..1c159d669a6a0f7b56c377e0b1cfa35b3fb75d53 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -52,7 +51,6 @@ struct Segment { int64_t beginPos; // beginning position in the local value or grad buffer }; - class Parameter; typedef std::shared_ptr ParameterPtr; @@ -129,8 +127,7 @@ public: if (config_.dims_size() == 2) { if (matType == MAT_NORMAL || matType == MAT_NORMAL_SHARED || matType == MAT_SPARSE_ROW_PREFETCH_FULL_SIZE || - matType == MAT_VALUE_SHARED || - matType == MAT_SPARSE_ROW_IDS) { + matType == MAT_VALUE_SHARED || matType == MAT_SPARSE_ROW_IDS) { bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_); bufs_[type]->zeroMem(); } else { @@ -161,7 +158,8 @@ public: } } - void enableSharedType(ParameterType type, VectorPtr vec, + void enableSharedType(ParameterType type, + VectorPtr vec, MatrixPtr mat = nullptr) { if (!bufs_[type] && !mats_[type]) { bufs_[type] = vec; @@ -235,13 +233,17 @@ public: * * @see SparseRowCpuMatrix::sgdUpdate for more information. 
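The multi-threaded branch of Parameter::exec() above slices the parameter into per-thread sub-vectors via calcSplitArrayInterval with an 8-element alignment for AVX. A hedged sketch of that kind of split (an illustration of the idea, not Paddle's exact helper):

#include <algorithm>
#include <cstddef>
#include <utility>

// Divide [0, size) into numThreads contiguous chunks whose boundaries stay
// multiples of `align` (8 floats = one AVX register). Trailing threads may
// receive an empty range when size is small.
std::pair<size_t, size_t> splitInterval(size_t size,
                                        size_t tid,
                                        size_t numThreads,
                                        size_t align = 8) {
  size_t chunk = (size + numThreads - 1) / numThreads;
  chunk = (chunk + align - 1) / align * align;  // round chunk up to align
  const size_t begin = std::min(tid * chunk, size);
  const size_t end = std::min(begin + chunk, size);
  return {begin, end};
}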
*/ - void updateWithGradient(real learningRate, MatrixPtr gradMat, IVectorPtr t0, - int currentTime, bool fini = false); + void updateWithGradient(real learningRate, + MatrixPtr gradMat, + IVectorPtr t0, + int currentTime, + bool fini = false); /** * This function is used to calculate multiple gpus, but only as a candidate */ - void updateWithGradient(real learningRate, VectorPtr grad, + void updateWithGradient(real learningRate, + VectorPtr grad, bool normalUpdate = true); /** diff --git a/paddle/parameter/ParameterOptimizer.cpp b/paddle/parameter/ParameterOptimizer.cpp index 164b50c4d279102ce14d82b102a74e56dfc5b2fe..2a71d6aee4dae556956616bd317156cfaf8732f0 100644 --- a/paddle/parameter/ParameterOptimizer.cpp +++ b/paddle/parameter/ParameterOptimizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Logging.h" #include diff --git a/paddle/parameter/ParameterOptimizer.h b/paddle/parameter/ParameterOptimizer.h index 8c766743401dddc6468e6db22164843e286e6ad7..21a148333c2fd3aa127c5b3bb8160784864f4cce 100644 --- a/paddle/parameter/ParameterOptimizer.h +++ b/paddle/parameter/ParameterOptimizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "LearningRateScheduler.h" @@ -32,8 +31,8 @@ namespace paddle { */ class ParameterOptimizer { public: - typedef std::function + typedef std::function TraverseCallback; public: @@ -69,35 +68,35 @@ public: (void)numSamplesProcessed; } - /** - * following hooks useful for sparse update, - * because the traversal in block costs. - * called by Trainer after update and before finishBatch - * e.g. Trainer call like this: - * - * @code - * startBatch(); - * if (dense) { - * update(blockVec); - * } else {//sparse - * for (row : rows_in_block) {update(rowVec)} - * } - * auto callback = needSpecialTraversal(); - * if (callback) { - * // do traverse, maybe multi-thread - * if (dense) { - * callback(); - * } else {//sparse - * for (row : all_rows_in_block) {callback();} - * } - * } - * finishBatch(); - * @endcode - * - * @return callback if need traverse, - * else return nullptr. - * It should be no state change. - */ + /** + * following hooks useful for sparse update, + * because the traversal in block costs. + * called by Trainer after update and before finishBatch + * e.g. Trainer call like this: + * + * @code + * startBatch(); + * if (dense) { + * update(blockVec); + * } else {//sparse + * for (row : rows_in_block) {update(rowVec)} + * } + * auto callback = needSpecialTraversal(); + * if (callback) { + * // do traverse, maybe multi-thread + * if (dense) { + * callback(); + * } else {//sparse + * for (row : all_rows_in_block) {callback();} + * } + * } + * finishBatch(); + * @endcode + * + * @return callback if need traverse, + * else return nullptr. + * It should be no state change. + */ virtual TraverseCallback needSpecialTraversal( const ParameterConfig& config) const { return nullptr; @@ -112,47 +111,48 @@ public: * with its gradient in PARAMETER_GRADIENT. sparseId is row id, * when sparseId set, update is sparse, each time one row. 
*/ - virtual void update(const VectorPtr vecs[], const ParameterConfig& config, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId = -1LU) const = 0; - /** - * the following hooks catch up with the current time for sparse updates. - * In the beginning, call startCatchUpWith() and check the return value. - * In the end, call finishCatchUpWith() to finalize the state. - * The callback does the actual work and can be called many times for sparse data. - * e.g. the Trainer calls it like this: - * - * @code - * auto callback = startCatchUpWith(); - * if (callback) { - * // do catch up with, maybe multi-thread - * if (dense) { - * callback(); - * } else {//sparse - * for (row : rows_in_block) {callback();} - * } - * // finish catch up with, main thread - * finishCatchUpWith(); - * } - * @endcode - * - * @return a callback if catch-up is needed, - * else nullptr. - * It should cause no state change. - */ + /** + * the following hooks catch up with the current time for sparse updates. + * In the beginning, call startCatchUpWith() and check the return value. + * In the end, call finishCatchUpWith() to finalize the state. + * The callback does the actual work and can be called many times for sparse data. + * e.g. the Trainer calls it like this: + * + * @code + * auto callback = startCatchUpWith(); + * if (callback) { + * // do catch up with, maybe multi-thread + * if (dense) { + * callback(); + * } else {//sparse + * for (row : rows_in_block) {callback();} + * } + * // finish catch up with, main thread + * finishCatchUpWith(); + * } + * @endcode + * + * @return a callback if catch-up is needed, + * else nullptr. + * It should cause no state change. + */ virtual TraverseCallback startCatchUpWith() const { return nullptr; } virtual void finishCatchUpWith() {} - /** - * the following two hooks are used by the averager; - * they apply to the final parameter value (PARAMETER_VALUE or PARAMETER_APPLY). - * - * restore() will restore the original value if it applies to PARAMETER_VALUE. - * The caller must ensure it has caught up with the current time before apply. - * - * Use the returned callback the same way as the callback returned by - * ParameterOptimizer::needSpecialTraversal() - */ + /** + * the following two hooks are used by the averager; + * they apply to the final parameter value (PARAMETER_VALUE or PARAMETER_APPLY). + * + * restore() will restore the original value if it applies to PARAMETER_VALUE. + * The caller must ensure it has caught up with the current time before apply. + * + * Use the returned callback the same way as the callback returned by + * ParameterOptimizer::needSpecialTraversal() + */ virtual TraverseCallback apply() { return nullptr; } virtual TraverseCallback restore() { return nullptr; } @@ -180,7 +180,8 @@ protected: static TraverseCallback composeCallbacks( const TraverseCallbackVec& callbacks) { if (callbacks.size() > 1LU) { - return [callbacks](const VectorPtr vecs[], const ParameterConfig& config, + return [callbacks](const VectorPtr vecs[], + const ParameterConfig& config, size_t sparseId) { for (auto callback : callbacks) { callback(vecs, config, sparseId); diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 679e3bf89b517a91cdd1af6bdad4e199418485a5..510ec5bf48a7576f646ecf01b02c5047c637afeb 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
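Given the update() contract documented above (vecs[] indexed by ParameterType, sparseId selecting a row for sparse updates), a minimal plain-SGD subclass might look like the hedged sketch below. It assumes the protected learningRate_, pass_, and calcLearningRate helpers that the subclasses in this patch use; it is not a class from this codebase.

// Hedged sketch of the smallest useful ParameterOptimizer subclass.
class ToySgdOptimizer : public ParameterOptimizer {
public:
  explicit ToySgdOptimizer(const OptimizationConfig& optConfig)
      : ParameterOptimizer(optConfig) {}

  virtual void startBatch(int64_t numSamplesProcessed) {
    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
  }

  // Dense-only: value -= lr * grad; sparseId is ignored.
  virtual void update(const VectorPtr vecs[],
                      const ParameterConfig& config,
                      size_t sparseId = -1LU) const {
    (void)sparseId;
    vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_GRADIENT],
                               -learningRate_ * config.learning_rate());
  }
};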
*/ - #include "paddle/utils/Logging.h" #ifdef __AVX__ #include @@ -23,8 +22,13 @@ limitations under the License. */ namespace paddle { -void sgdUpdateCpu(real learningRate, real momentum, real decayRate, size_t size, - real* value, const real* grad, real* momentumVec) { +void sgdUpdateCpu(real learningRate, + real momentum, + real decayRate, + size_t size, + real* value, + const real* grad, + real* momentumVec) { decayRate *= learningRate; for (size_t i = 0; i < size; ++i) { momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] - @@ -33,8 +37,12 @@ void sgdUpdateCpu(real learningRate, real momentum, real decayRate, size_t size, } } -void sgdUpdate(real learningRate, real momentum, real decayRate, Vector* value, - Vector* grad, Vector* momentumVec) { +void sgdUpdate(real learningRate, + real momentum, + real decayRate, + Vector* value, + Vector* grad, + Vector* momentumVec) { size_t size = value->getSize(); real* val = value->getData(); real* grd = grad->getData(); @@ -48,8 +56,12 @@ void sgdUpdate(real learningRate, real momentum, real decayRate, Vector* value, } } -void sgdUpdateAvx(float learningRate, float momentum, float decayRate, - size_t size, float* value, const float* _grad, +void sgdUpdateAvx(float learningRate, + float momentum, + float decayRate, + size_t size, + float* value, + const float* _grad, float* momentumVec) { #ifdef __AVX__ float* grad = const_cast(_grad); // the gradient is not modified @@ -86,18 +98,36 @@ void sgdUpdateAvx(float learningRate, float momentum, float decayRate, std::function loopFun; learningRate *= -1; - lr = _mm256_set_ps(learningRate, learningRate, learningRate, learningRate, - learningRate, learningRate, learningRate, learningRate); + lr = _mm256_set_ps(learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate, + learningRate); if (0 != momentum) { - mom = _mm256_set_ps(momentum, momentum, momentum, momentum, momentum, - momentum, momentum, momentum); + mom = _mm256_set_ps(momentum, + momentum, + momentum, + momentum, + momentum, + momentum, + momentum, + momentum); } decayRate *= learningRate; if (0 != decayRate) { - dr = _mm256_set_ps(decayRate, decayRate, decayRate, decayRate, decayRate, - decayRate, decayRate, decayRate); + dr = _mm256_set_ps(decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate, + decayRate); } auto gradMulFun = [&](void) { diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h index 59eb25656e51c097b2d957902573437894ab53f7..2d98030bd2389469fbd32940af6162203557620c 100644 --- a/paddle/parameter/ParameterUpdateFunctions.h +++ b/paddle/parameter/ParameterUpdateFunctions.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/TypeDefs.h" @@ -31,14 +30,27 @@ namespace paddle { * momentum = 0 or decayRate = 0 are specially handled to avoid unnecessary * computation. 
*/ -void sgdUpdate(real learningRate, real momentum, real decayRate, Vector* value, - Vector* grad, Vector* momentumVec); - -void sgdUpdateCpu(real learningRate, real momentum, real decayRate, size_t size, - real* value, const real* grad, real* momentumVec); - -void sgdUpdateAvx(float learningRate, float momentum, float decayRate, - size_t size, float* value, const float* grad, +void sgdUpdate(real learningRate, + real momentum, + real decayRate, + Vector* value, + Vector* grad, + Vector* momentumVec); + +void sgdUpdateCpu(real learningRate, + real momentum, + real decayRate, + size_t size, + real* value, + const real* grad, + real* momentumVec); + +void sgdUpdateAvx(float learningRate, + float momentum, + float decayRate, + size_t size, + float* value, + const float* grad, float* momentumVec); } // namespace paddle diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp index e3f1d54037b305972248b7b30065b0ae5eb4b357..e706742053fc49df9c99081774f425622941e38c 100644 --- a/paddle/parameter/ParameterUpdaterBase.cpp +++ b/paddle/parameter/ParameterUpdaterBase.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/Logging.h" #include "ParameterUpdaterBase.h" diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h index f16e183515853e01eacda39977c9a7e127b3824c..ffd2980261530382ee09f2c98e354d0e56fd8038 100644 --- a/paddle/parameter/ParameterUpdaterBase.h +++ b/paddle/parameter/ParameterUpdaterBase.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "Parameter.h" diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index 02a352920cf38120938d659dac4258a48643de4d..7d85a32c0cf527d39c252c2021b7bad0eb58753d 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ParameterUpdaterHook.h" #include @@ -155,7 +154,8 @@ private: std::hash intHasher_; }; -static WeakKVCache, IParameterUpdaterHook, +static WeakKVCache, + IParameterUpdaterHook, StringIntPairHasher> g_hookCache_; /** diff --git a/paddle/parameter/ParameterUpdaterHook.h b/paddle/parameter/ParameterUpdaterHook.h index 1c132a733866b8083632a64b1b47ff2b35b2ee69..553282bcaaa2e90910eaafbe2e03a4afadf04a85 100644 --- a/paddle/parameter/ParameterUpdaterHook.h +++ b/paddle/parameter/ParameterUpdaterHook.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include diff --git a/paddle/parameter/Regularizer.cpp b/paddle/parameter/Regularizer.cpp index bc7de3ca048dbe094e1f53c024e705425908cdfb..a9bddc1596656ba36d6c445781f42991684f0c52 100644 --- a/paddle/parameter/Regularizer.cpp +++ b/paddle/parameter/Regularizer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ - #include "paddle/utils/Util.h" #include "paddle/utils/Flags.h" #include "Regularizer.h" @@ -21,8 +20,9 @@ namespace paddle { Regularizer* Regularizer::get(const std::vector& types, const ParameterConfig& paraConfig) { - bool useLearningRateVec = std::find(types.begin(), types.end(), - PARAMETER_LEARNING_RATE) != types.end(); + bool useLearningRateVec = + std::find(types.begin(), types.end(), PARAMETER_LEARNING_RATE) != + types.end(); if (paraConfig.decay_rate_l1() > 0.0f && paraConfig.decay_rate() > 0.0f) { // use L1 and L2 if (useLearningRateVec) { diff --git a/paddle/parameter/Regularizer.h b/paddle/parameter/Regularizer.h index 8c9eb49ab611e8aea7b88f008fe287cbdb17a008..5baaccc00db5f858272dbfa6751647915bfa6e3c 100644 --- a/paddle/parameter/Regularizer.h +++ b/paddle/parameter/Regularizer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "ParameterUpdaterBase.h" @@ -22,7 +21,8 @@ namespace paddle { // Regularizer function for parameter, e.g. L1/L2 class Regularizer { public: - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, real learningRate, // learningrate from optimizer int t0, // last occurence time int t) const = 0; // current time @@ -34,8 +34,11 @@ public: // L1 Regularizer, |w|_1 class L1Regularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); } @@ -43,8 +46,11 @@ class L1Regularizer : public Regularizer { // L1 Lr Regularizer class L1LrRegularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE], learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); @@ -53,8 +59,11 @@ class L1LrRegularizer : public Regularizer { // L2 Regularizer, |w|_2^2 class L2Regularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(), paraConfig.decay_rate() * (t - t0)); } @@ -62,8 +71,11 @@ class L2Regularizer : public Regularizer { // L2 Lr Regularizer class L2LrRegularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL2(*vecs[PARAMETER_LEARNING_RATE], learningRate * paraConfig.learning_rate(), 
paraConfig.decay_rate() * (t - t0)); @@ -72,8 +84,11 @@ class L2LrRegularizer : public Regularizer { // L1 + L2 Regularizer, |w|_1 + |w|_2^2 class L1L2Regularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); vecs[PARAMETER_VALUE]->applyL2(learningRate * paraConfig.learning_rate(), @@ -83,8 +98,11 @@ class L1L2Regularizer : public Regularizer { // L1 + L2 Lr Regularizer class L1L2LrRegularizer : public Regularizer { - virtual void update(const VectorPtr vecs[], const ParameterConfig& paraConfig, - real learningRate, int t0, int t) const { + virtual void update(const VectorPtr vecs[], + const ParameterConfig& paraConfig, + real learningRate, + int t0, + int t) const { vecs[PARAMETER_VALUE]->applyL1(*vecs[PARAMETER_LEARNING_RATE], learningRate * paraConfig.learning_rate(), paraConfig.decay_rate_l1() * (t - t0)); diff --git a/paddle/parameter/Weight.cpp b/paddle/parameter/Weight.cpp index ed02355c01a587da36da038be9e3d6eaf559c884..c138010607412fa257a6c7360a27d855197f88ad 100644 --- a/paddle/parameter/Weight.cpp +++ b/paddle/parameter/Weight.cpp @@ -60,14 +60,20 @@ Weight::Weight(size_t height, size_t width, ParameterPtr param, size_t offset) { // weight_ if (vPtr) { - weight_ = Matrix::create(vPtr->getData() + offset, height, width, - /* trans */ false, param->useGpu()); + weight_ = Matrix::create(vPtr->getData() + offset, + height, + width, + /* trans */ false, + param->useGpu()); } // weightGrad if (gPtr) { - weightGrad_ = Matrix::create(gPtr->getData() + offset, height, width, - /* trans */ false, param->useGpu()); + weightGrad_ = Matrix::create(gPtr->getData() + offset, + height, + width, + /* trans */ false, + param->useGpu()); } parameter_ = param; diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp index 1a22abf7cf80157039f6147293e7648d654e45f7..1a64fe335257a3107be03cfd333cb483c5ab452d 100644 --- a/paddle/parameter/tests/test_common.cpp +++ b/paddle/parameter/tests/test_common.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
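All of these regularizers take a (t0, t) pair so that decay skipped while a sparse row went untouched can be applied in one shot on the next visit. A hedged, standalone sketch of that catch-up idea for L2 decay, using a compounded per-step shrink (illustrative math only; Paddle's applyL2 receives decay_rate * (t - t0) directly, which suggests a linearized form):

#include <cmath>
#include <vector>

// Apply (t - t0) batches worth of multiplicative L2 weight decay at once,
// the next time a sparse row is actually touched.
void lazyL2Decay(std::vector<float>& row, float lrTimesDecay, int t0, int t) {
  const float shrink =
      std::pow(1.0f - lrTimesDecay, static_cast<float>(t - t0));
  for (float& w : row) w *= shrink;
}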
*/ - #include #include @@ -38,8 +37,8 @@ protected: CommonTest() : testStat_("test") {} virtual ~CommonTest() {} virtual void SetUp() { - const size_t buffSize[] = {100, 128, 500, 1024, - 4096, 10240, 102400, 1000000}; + const size_t buffSize[] = { + 100, 128, 500, 1024, 4096, 10240, 102400, 1000000}; sizeVec_.resize(8); memcpy(&sizeVec_[0], &buffSize[0], 8 * sizeof(size_t)); valueUint_.resize(4); @@ -54,8 +53,10 @@ protected: learningRate_ = 1.0; } - void test_sgdUpadate(real* gradientBuffer, real* valueBuffer, - real* momentumBuffer, size_t size); + void test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size); virtual void TreaDown() { LOG(INFO) << "All Test Finished."; } @@ -66,8 +67,10 @@ protected: StatSet testStat_; }; -void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, - real* momentumBuffer, size_t size) { +void CommonTest::test_sgdUpadate(real* gradientBuffer, + real* valueBuffer, + real* momentumBuffer, + size_t size) { // sgdUpdateAvx has no double version yet #if defined(__AVX__) && !defined(PADDLE_TYPE_DOUBLE) real valueSum1 = 0, valueSum2 = 0, momSum1 = 0, momSum2 = 0; @@ -85,8 +88,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, gettimeofday(&t, NULL); } REGISTER_TIMER("avxTimer", 0); - sgdUpdateAvx(learningRate_, arg.first, arg.second, size, valueBuffer, - gradientBuffer, momentumBuffer); + sgdUpdateAvx(learningRate_, + arg.first, + arg.second, + size, + valueBuffer, + gradientBuffer, + momentumBuffer); } for (size_t i = 0; i < size; i++) { valueSum1 += valueBuffer[i]; @@ -98,8 +106,13 @@ void CommonTest::test_sgdUpadate(real* gradientBuffer, real* valueBuffer, } { REGISTER_TIMER("cpuTimer", 0); - sgdUpdateCpu(learningRate_, arg.first, arg.second, size, valueTmp, - gradTmp, momentumTmp); + sgdUpdateCpu(learningRate_, + arg.first, + arg.second, + size, + valueTmp, + gradTmp, + momentumTmp); } for (size_t i = 0; i < size; i++) { valueSum2 += valueTmp[i]; @@ -126,10 +139,10 @@ TEST_F(CommonTest, sgdUpdate) { for (auto& size : sizeVec_) { real *gradientBuffer, *valueBuffer, *momentumBuffer; CHECK_EQ(posix_memalign((void**)&gradientBuffer, 32, sizeof(real) * size), - 0); + 0); CHECK_EQ(posix_memalign((void**)&valueBuffer, 32, sizeof(real) * size), 0); CHECK_EQ(posix_memalign((void**)&momentumBuffer, 32, sizeof(real) * size), - 0); + 0); for (size_t i = 0; i < size; i++) { gradientBuffer[i] = 1.0; @@ -141,7 +154,8 @@ TEST_F(CommonTest, sgdUpdate) { << "-------------------------"; test_sgdUpadate(&gradientBuffer[alignHeader[i]], &valueBuffer[alignHeader[i]], - &momentumBuffer[alignHeader[i]], size - alignHeader[i]); + &momentumBuffer[alignHeader[i]], + size - alignHeader[i]); } free(gradientBuffer); free(valueBuffer); @@ -173,16 +187,16 @@ TEST_F(CommonTest, barrierStat) { SyncThreadPool pool(threadNum); -#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - struct timeval time; \ - gettimeofday(&time, nullptr); \ - uint64_t usec = timeToMicroSecond(time); \ - std::srand(usec); \ - auto value = std::rand() % 100000; \ - usleep(value); \ - REGISTER_SLOW_NODES_PROBE(globalStat, statName, numConnThreads, tid, \ - __VA_ARGS__); \ +#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...) 
\ + pool.exec([&](int tid, size_t numThreads) { \ + struct timeval time; \ + gettimeofday(&time, nullptr); \ + uint64_t usec = timeToMicroSecond(time); \ + std::srand(usec); \ + auto value = std::rand() % 100000; \ + usleep(value); \ + REGISTER_SLOW_NODES_PROBE( \ + globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ }); for (auto i = 0; i < 10; i++) { @@ -202,11 +216,11 @@ TEST_F(CommonTest, barrierStat) { globalStat.reset(); // use it to test accurate barrier gap -#define TEST_BARRIER(statName, numConnThreads, ...) \ - pool.exec([&](int tid, size_t numThreads) { \ - usleep(tid * 10000); \ - REGISTER_SLOW_NODES_PROBE(globalStat, statName, numConnThreads, tid, \ - __VA_ARGS__); \ +#define TEST_BARRIER(statName, numConnThreads, ...) \ + pool.exec([&](int tid, size_t numThreads) { \ + usleep(tid * 10000); \ + REGISTER_SLOW_NODES_PROBE( \ + globalStat, statName, numConnThreads, tid, __VA_ARGS__); \ }); for (auto i = 0; i < 10; i++) { diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp index df4daca9bfaf888ccaacc73d9295d6d973dcb9fb..ff83970ab1b11f74ceb4009cc8f469f7b54a7272 100644 --- a/paddle/pserver/BaseClient.cpp +++ b/paddle/pserver/BaseClient.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include "paddle/utils/Stat.h" diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h index f1c4c9eb375420edaf895c3ddea7ac06f7b225bd..3a501172b70a91e02ecda0f9f78e0c025ac67936 100644 --- a/paddle/pserver/BaseClient.h +++ b/paddle/pserver/BaseClient.h @@ -62,7 +62,10 @@ public: /// send data to server, support only synchronize template - void putData(int clientId, SendDataType type, DataType* datas, size_t size, + void putData(int clientId, + SendDataType type, + DataType* datas, + size_t size, DataUpdateMode mode) { synchronize(SYNC_DATA); sendData(clientId, type, mode, datas, size); @@ -71,16 +74,23 @@ public: } template - void putOwnData(int clientId, SendDataType type, DataType* datas, + void putOwnData(int clientId, + SendDataType type, + DataType* datas, size_t size) { putData(clientId, type, datas, size, DATA_UPDATE_MODE_SET_OWN); } template - void getAllData(int clientId, SendDataType type, DataType* datas, + void getAllData(int clientId, + SendDataType type, + DataType* datas, size_t size) { - sendData(clientId, type, DATA_UPDATE_MODE_GET_ALL, - reinterpret_cast(NULL), 0); + sendData(clientId, + type, + DATA_UPDATE_MODE_GET_ALL, + reinterpret_cast(NULL), + 0); recvData(); size_t dataOffset = 0; for (auto& recvMem : recvDataMems_) { @@ -100,7 +110,10 @@ public: * The results are saved in recvBuf of rootId client */ template - void reduce(DataType* sendBuf, DataType* recvBuf, size_t size, int clientId, + void reduce(DataType* sendBuf, + DataType* recvBuf, + size_t size, + int clientId, int rootId) { putOwnData(clientId, DATA_REDUCE_SUM, sendBuf, size); if (rootId == clientId) { @@ -147,8 +160,12 @@ protected: void finishThreads(); template - void prepareData(int clientId, SendDataType type, DataUpdateMode updateMode, - DataType* datas, size_t size, SendJob* sendJob) { + void prepareData(int clientId, + SendDataType type, + DataUpdateMode updateMode, + DataType* datas, + size_t size, + SendJob* sendJob) { sendJob->parallelDataRequests.resize(serviceNum_); sendJob->parallelInputIovs.resize(serviceNum_); for (int i = 0; i < serviceNum_; ++i) { @@ -192,8 +209,11 @@ protected: * 
synchronization in metric learning. */ template - void sendData(int clientId, SendDataType type, DataUpdateMode updateMode, - DataType* datas, size_t size) { + void sendData(int clientId, + SendDataType type, + DataUpdateMode updateMode, + DataType* datas, + size_t size) { SendJobPtr sendJob = std::make_shared(); prepareData(clientId, type, updateMode, datas, size, sendJob.get()); for (int i = 0; i < threadNum_; ++i) { @@ -210,7 +230,8 @@ protected: /// send request, and recv responses template - void multiCall(const char* funcName, const ProtoIn& request, + void multiCall(const char* funcName, + const ProtoIn& request, std::vector* responses) { responses->resize(clients_.size()); size_t numClients = clients_.size(); diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index ff2875fc702ffbb0675f21433138961c19ff0b86..1830170a163fa47114c75a2a88a731ea31060142 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include #include @@ -32,19 +31,22 @@ limitations under the License. */ #include "RDMANetwork.h" /// quick ack can reduce the latency of small message -P_DEFINE_bool(small_messages, false, +P_DEFINE_bool(small_messages, + false, "if message size is small, recommend set it True to enable quick " "ack and no delay"); /// reasonable sock_send_buf_size can control the traffic injected into switch /// network. Injecting too many data into traffic could cause packets loss which /// cause long latency and degrade the efficiency of communication. -P_DEFINE_int32(sock_send_buf_size, 1024 * 1024 * 40, +P_DEFINE_int32(sock_send_buf_size, + 1024 * 1024 * 40, "restrict sock send buff size, can reduce network congestion if " "set carefully"); /// reasonable size can hold bursted packets and reduce packets loss -P_DEFINE_int32(sock_recv_buf_size, 1024 * 1024 * 40, +P_DEFINE_int32(sock_recv_buf_size, + 1024 * 1024 * 40, "restrict sock recv buff size"); namespace paddle { @@ -174,7 +176,8 @@ void SocketServer::tcpServer() { if (!addr_.empty()) { server = gethostbyname(addr_.c_str()); PCHECK(server) << "ERROR, no such host: " << addr_; - bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, + bcopy((char *)server->h_addr, + (char *)&serv_addr.sin_addr.s_addr, server->h_length); } else { serv_addr.sin_addr.s_addr = INADDR_ANY; @@ -347,29 +350,32 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { struct sockaddr_in serv_addr; struct hostent *server; - int errRet; // temp for gethostbyname_r + int errRet; // temp for gethostbyname_r /// Create a socket point int sockfd = socket(AF_INET, SOCK_STREAM, 0); PCHECK(sockfd >= 0) << "ERROR opening socket"; #if defined(__OSX__) || defined(__APPLE__) - server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); - CHECK_NE(HOST_NOT_FOUND, errRet) - << "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "getipnodebyname error!"; + server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet); + CHECK_NE(HOST_NOT_FOUND, errRet) << "ERROR, no such host: " << serverAddr + << " ret = " << errRet; + CHECK(server) << "getipnodebyname error!"; #else - struct hostent hostinfo; - char buf[1024]; // temp for gethostbyname_r - CHECK_EQ(0, gethostbyname_r(serverAddr.c_str(), &hostinfo, buf, sizeof(buf), - &server, &errRet)) - 
<< "ERROR, no such host: " << serverAddr << " ret = " << errRet; - CHECK(server) << "gethostbyname_r error!"; + struct hostent hostinfo; + char buf[1024]; // temp for gethostbyname_r + CHECK_EQ( + 0, + gethostbyname_r( + serverAddr.c_str(), &hostinfo, buf, sizeof(buf), &server, &errRet)) + << "ERROR, no such host: " << serverAddr << " ret = " << errRet; + CHECK(server) << "gethostbyname_r error!"; #endif bzero((char *)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; - bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, + bcopy((char *)server->h_addr, + (char *)&serv_addr.sin_addr.s_addr, server->h_length); serv_addr.sin_port = htons(serverPort); @@ -421,7 +427,8 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) { * * @note responsible for building one connection to specified pserver port */ -SocketClient::SocketClient(const std::string &serverAddr, int serverPort, +SocketClient::SocketClient(const std::string &serverAddr, + int serverPort, enum ChannelType channelType) { if (channelType == F_RDMA) RdmaClient(serverAddr, serverPort); diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h index 0d6d6bf6b7c6d3b7123f9ce05f50ad45bfd5ac60..b7d7bc7902abb18aae03fc4d8a3972f0298199fe 100644 --- a/paddle/pserver/LightNetwork.h +++ b/paddle/pserver/LightNetwork.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "SocketChannel.h" @@ -39,9 +38,9 @@ class SocketWorker; * in child class of socketserver. */ class SocketServer : public Thread { - // rdmaCpu controls the cpu affinity of RDMA server daemon, - // which could benifit performance. rdmaCpu = -1 means TCP - // is used instead of RDMA transport. + // rdmaCpu controls the cpu affinity of RDMA server daemon, + // which could benifit performance. rdmaCpu = -1 means TCP + // is used instead of RDMA transport. public: SocketServer(const std::string& addr, int port, int rdmaCpu); ~SocketServer(); @@ -91,7 +90,6 @@ protected: bool stopping_; }; - /** * @brief class for holding one connection from one trainer * @@ -165,7 +163,8 @@ private: */ class SocketClient { public: - SocketClient(const std::string& serverAddr, int serverPort, + SocketClient(const std::string& serverAddr, + int serverPort, enum ChannelType channelType); SocketChannel* getChannel() { return channel_.get(); } diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index d0e5352c828d197d6854ef19e6310dc63913846d..28cc0ae2dd36273397015e618f6e14ea43398964 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ b/paddle/pserver/ParameterClient2.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include #include "ParameterClient2.h" @@ -27,7 +26,8 @@ P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send"); namespace paddle { template -void copyToRepeatedField(google::protobuf::RepeatedField* dest, const T* src, +void copyToRepeatedField(google::protobuf::RepeatedField* dest, + const T* src, size_t size) { dest->Clear(); dest->Reserve(size); @@ -46,11 +46,10 @@ void copyToRepeatedField(const std::vector& src, ParameterClient2::ParameterClient2(bool separate, int port, int numPorts) : BaseClient(separate, numPorts), port_(port) { #ifndef PADDLE_DISABLE_TIMER - forwardbackwordTime_ = 0; + forwardbackwordTime_ = 0; #endif } - int ParameterClient2::calcParameterBlockSize( const std::vector& parameters, size_t serviceNum) { size_t totalSize = 0; @@ -89,8 +88,8 @@ bool ParameterClient2::init(const std::vector& parameters) { for (auto& para : parameters) { /// set block size for each parameter para->getConfig().set_parameter_block_size( - para->getConfig().sparse_remote_update() ? - para->getConfig().dims(1) : denseBlockSize); + para->getConfig().sparse_remote_update() ? para->getConfig().dims(1) + : denseBlockSize); } for (auto& para : parameters) { @@ -107,7 +106,7 @@ bool ParameterClient2::init(const std::vector& parameters) { allSegments_.push_back(segments); if (para->getConfig().sparse_remote_update()) { CHECK_EQ(para->getConfig().parameter_block_size(), - para->getConfig().dims(1)) + para->getConfig().dims(1)) << "For sparse remote update parameter," << " block size is the width of each row."; } @@ -152,7 +151,8 @@ void ParameterClient2::destroy() { clients_.clear(); } -void ParameterClient2::sendParallel(int tid, size_t numThreads, +void ParameterClient2::sendParallel(int tid, + size_t numThreads, ParameterType recvParameterType) { int numMyClients = divup(serviceNum_ - tid, numThreads); @@ -163,7 +163,8 @@ void ParameterClient2::sendParallel(int tid, size_t numThreads, /// at the same time so that they will not flood data to the same /// pserver. 
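The copyToRepeatedField hunk above shows only the Clear/Reserve prologue; the fill loop itself sits outside the diff context. A sketch of the usual pattern, assuming element-by-element Add into the reserved field (the loop body is an assumption, not the verbatim Paddle code):

#include <google/protobuf/repeated_field.h>
#include <cstddef>

// Assumed continuation of copyToRepeatedField: reserve once up front,
// then append each element of the raw array.
template <typename T>
void copyToRepeatedFieldSketch(google::protobuf::RepeatedField<T>* dest,
                               const T* src,
                               size_t size) {
  dest->Clear();
  dest->Reserve(static_cast<int>(size));
  for (size_t i = 0; i < size; ++i) {
    dest->Add(src[i]);  // stays within the reserved capacity
  }
}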
i = calcClientId(i, serviceNum_); - clients_[i].send("sendParameter", sendJob_.parallelRequests[i], + clients_[i].send("sendParameter", + sendJob_.parallelRequests[i], sendJob_.parallelInputIovs[i]); /// clear large structure @@ -204,10 +205,15 @@ void ParameterClient2::sendParallel(int tid, size_t numThreads, } void ParameterClient2::prepareSendData( - ParameterUpdateMode updateMode, ParameterType parameterType, - const std::vector& parameterSegments, int64_t numSamples, - real cost, bool sendBackParameter, ParameterType sendBackParameterType, - BatchStatus batchStatus, SendJob* sendJob) { + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + BatchStatus batchStatus, + SendJob* sendJob) { sendJob->parallelRequests.resize(serviceNum_); sendJob->parallelInputIovs.resize(serviceNum_); @@ -247,11 +253,11 @@ void ParameterClient2::prepareSendData( const auto prefetchMat = parameter->getPrefetchMatrix(); CHECK(prefetchMat != nullptr) << "prefetchMat is nullptr"; auto sendMat = dynamic_cast( - parameter->getMat(parameterType).get()); + parameter->getMat(parameterType).get()); CHECK(sendMat != nullptr) << "sendMat is nullptr"; syncThreadPool_->exec([&](int tid, size_t numThreads) { - const auto &localIndices = prefetchMat->getLocalIndices(); + const auto& localIndices = prefetchMat->getLocalIndices(); /// num of sparse rows size_t nLocalBlocks = localIndices.size(); uint64_t beginDim = 0; @@ -278,17 +284,17 @@ void ParameterClient2::prepareSendData( if (sendingPara) { sendJob->parallelInputIovs[serverId].push_back( - {sendMat->getLocalRow(row), sizeof(real) * (size_t) blockSize}); + {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); /// detect sparse parameter distribution sparseDistribution_->probeDistribution(serverId, - sizeof(real) * blockSize); + sizeof(real) * blockSize); } } }); } else { /// parameter set for dense and sparse - real* buf = sendingPara ? - parameter->getBuf(parameterType)->getPoint(0) : nullptr; + real* buf = + sendingPara ? 
parameter->getBuf(parameterType)->getPoint(0) : nullptr; uint64_t endDim = 0; for (uint64_t beginDim = 0; beginDim < paraSize; beginDim = endDim) { endDim = std::min(beginDim + blockSize, paraSize); @@ -302,8 +308,8 @@ void ParameterClient2::prepareSendData( block->set_begin_pos(beginDim); block->set_block_size(endDim - beginDim); if (buf) { - sendJob->parallelInputIovs[serverId].push_back({buf + beginDim, - sizeof(real) * ((size_t) (endDim - beginDim))}); + sendJob->parallelInputIovs[serverId].push_back( + {buf + beginDim, sizeof(real) * ((size_t)(endDim - beginDim))}); } } } @@ -313,13 +319,23 @@ void ParameterClient2::prepareSendData( } void ParameterClient2::sendAndReceiveParameter( - ParameterUpdateMode updateMode, ParameterType parameterType, - const std::vector& parameterSegments, int64_t numSamples, - real cost, bool sendBackParameter, ParameterType sendBackParameterType, + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, ParameterType recvParameterType) { - prepareSendData(updateMode, parameterType, parameterSegments, numSamples, - cost, sendBackParameter, sendBackParameterType, - /*batchStatus = */ BATCH_START_AND_FINISH, &sendJob_); + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, + /*batchStatus = */ BATCH_START_AND_FINISH, + &sendJob_); syncThreadPool_->exec([&](int tid, size_t numThreads) { this->sendParallel(tid, numThreads, recvParameterType); @@ -327,12 +343,22 @@ void ParameterClient2::sendAndReceiveParameter( } void ParameterClient2::sendParameter( - ParameterUpdateMode updateMode, ParameterType parameterType, - const std::vector& parameterSegments, int64_t numSamples, - real cost, bool sendBackParameter, BatchStatus batchStatus) { + ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& parameterSegments, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { SendJobPtr sendJob = std::make_shared(); - prepareSendData(updateMode, parameterType, parameterSegments, numSamples, - cost, sendBackParameter, PARAMETER_VALUE, batchStatus, + prepareSendData(updateMode, + parameterType, + parameterSegments, + numSamples, + cost, + sendBackParameter, + PARAMETER_VALUE, + batchStatus, sendJob.get()); for (int i = 0; i < threadNum_; i++) { @@ -360,10 +386,12 @@ void ParameterClient2::send(int threadId) { /// pserver. 
i = calcClientId(i, serviceNum_); if (recvJob->parallelRequests.size()) { - clients_[i].send("sendParameter", recvJob->parallelRequests[i], + clients_[i].send("sendParameter", + recvJob->parallelRequests[i], recvJob->parallelInputIovs[i]); } else { - clients_[i].send("sendData", recvJob->parallelDataRequests[i], + clients_[i].send("sendData", + recvJob->parallelDataRequests[i], recvJob->parallelInputIovs[i]); } } @@ -586,12 +614,13 @@ void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) { ProtoMatrix& pmat = *op->add_matrices(); pmat.set_num_cols(mat->getWidth()); pmat.set_num_rows(mat->getHeight()); - copyToRepeatedField(pmat.mutable_values(), mat->getData(), - pmat.num_cols() * pmat.num_rows()); + copyToRepeatedField( + pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows()); } void ParameterClient2::doOperation(PreparedOperations& ops, - bool waitForGradient, bool sendBackGradient, + bool waitForGradient, + bool sendBackGradient, bool releasePass) { std::vector responses; ops.request_.set_wait_for_gradient(waitForGradient); @@ -666,7 +695,8 @@ void ParameterClient2::doOperation(PreparedOperations& ops, CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols()); CpuMatrixPtr amat = std::make_shared(const_cast(mat.values().data()), - rmat->getHeight(), rmat->getWidth()); + rmat->getHeight(), + rmat->getWidth()); rmat->add(*amat); } } @@ -700,14 +730,17 @@ void ParameterClient2::vectorAddMult(PServerVector u, PServerVector v, real a) { doOperation(ops, false, false); } -void ParameterClient2::vectorAddMultInto(PServerVector u, PServerVector v, - PServerVector w, real a) { +void ParameterClient2::vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, + real a) { PreparedOperations ops; ops.addOperation(PSERVER_OP_au_bv_cw, v, w, u, (real)1, a, (real)0); doOperation(ops, false, false); } -void ParameterClient2::vectorScaleInto(PServerVector u, PServerVector v, +void ParameterClient2::vectorScaleInto(PServerVector u, + PServerVector v, real a) { PreparedOperations ops; ops.addOperation(PSERVER_OP_au_bv, v, u, a, (real)0); diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h index 7a4085ad8230747ab3c740910695932623946a5e..af8dd41ec4327fcf78625e7aa5d4b136ca7d14dd 100644 --- a/paddle/pserver/ParameterClient2.h +++ b/paddle/pserver/ParameterClient2.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -190,8 +189,8 @@ protected: }; struct ParameterSegments { - std::string name; // name of the parameter - size_t id; // id of the parameter + std::string name; // name of the parameter + size_t id; // id of the parameter }; /** @@ -225,7 +224,8 @@ public: * connections the parameter client maintains. */ ParameterClient2(bool separate = false, - int port = FLAGS_port, int numPorts = FLAGS_ports_num); + int port = FLAGS_port, + int numPorts = FLAGS_ports_num); ~ParameterClient2(); @@ -255,14 +255,14 @@ public: * client[recvParameterType] * @note Only parameterType will be sent. 
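sendAndReceiveParameter now takes one argument per line; with eight positional parameters, that layout leaves room for a comment on every argument at the call site. A hypothetical call (the ADD_GRADIENT mode constant and the surrounding helper are assumptions, following the constants used elsewhere in this header):

// Hypothetical: push gradients and fetch fresh values in one round trip.
void pushGradients(ParameterClient2& client,
                   const std::vector<ParameterSegments>& segments,
                   int64_t numSamples,
                   real cost) {
  client.sendAndReceiveParameter(PSERVER_UPDATE_MODE_ADD_GRADIENT,  // updateMode
                                 PARAMETER_GRADIENT,  // parameterType sent
                                 segments,            // which blocks
                                 numSamples,          // samples in batch
                                 cost,                // summed batch cost
                                 true,                // sendBackParameter
                                 PARAMETER_VALUE,     // type sent back
                                 PARAMETER_VALUE);    // type received into
}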
*/ - void sendAndReceiveParameter( - ParameterUpdateMode updateMode, - ParameterType parameterType, - const std::vector& segments, - int64_t numSamples, - real cost, bool sendBackParameter, - ParameterType sendBackParameterType, - ParameterType recvParameterType); + void sendAndReceiveParameter(ParameterUpdateMode updateMode, + ParameterType parameterType, + const std::vector& segments, + int64_t numSamples, + real cost, + bool sendBackParameter, + ParameterType sendBackParameterType, + ParameterType recvParameterType); /** * @brief Sends all parameters to parameter servers, and receives the response @@ -276,8 +276,13 @@ public: bool sendBackParameter, ParameterType sendBackParameterType = PARAMETER_VALUE, ParameterType recvParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(updateMode, parameterType, allSegments_, numSamples, - cost, sendBackParameter, sendBackParameterType, + sendAndReceiveParameter(updateMode, + parameterType, + allSegments_, + numSamples, + cost, + sendBackParameter, + sendBackParameterType, recvParameterType); } @@ -302,29 +307,41 @@ public: void sendParameter(ParameterUpdateMode updateMode, ParameterType parameterType, const std::vector& segments, - int64_t numSamples, real cost, bool sendBackParameter, + int64_t numSamples, + real cost, + bool sendBackParameter, BatchStatus batchStatus); void recvParameter(); /** - * Sends all parameters to parameter servers, recvParameter() have to be invoked + * Sends all parameters to parameter servers, recvParameter() have to be + * invoked * afterwards. * * @note This function is non-blocking. This means that if parameter should * not changes between this call and recvParameter() */ void sendParameter(ParameterUpdateMode updateMode, - ParameterType parameterType, int64_t numSamples, real cost, - bool sendBackParameter, BatchStatus batchStatus) { - sendParameter(updateMode, parameterType, allSegments_, numSamples, cost, - sendBackParameter, batchStatus); + ParameterType parameterType, + int64_t numSamples, + real cost, + bool sendBackParameter, + BatchStatus batchStatus) { + sendParameter(updateMode, + parameterType, + allSegments_, + numSamples, + cost, + sendBackParameter, + batchStatus); } /// Get all parameters from parameter servers void getParameter(ParameterType recvParameterType = PARAMETER_VALUE, ParameterType sendBackParameterType = PARAMETER_VALUE) { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, PARAMETER_VALUE, + sendAndReceiveParameter(PSERVER_UPDATE_MODE_GET_PARAM, + PARAMETER_VALUE, 0, // numSamples = 0 0, // cost = 0 true, // sendBackParameter = true @@ -341,12 +358,14 @@ public: 0, // numSamples = 0 0, // cost = 0 true, // sendBackParameter = true - sendBackParameterType, recvParameterType); + sendBackParameterType, + recvParameterType); } /// Set all parameters on parameter servers using the local parameters void setParameter() { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, PARAMETER_VALUE, + sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM, + PARAMETER_VALUE, 0, // numSamples = 0 0, // cost = 0 false); // sendBackParameter = false @@ -356,7 +375,8 @@ public: * means do not sending local parameters */ void setParameterZero() { - sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO, PARAMETER_VALUE, + sendAndReceiveParameter(PSERVER_UPDATE_MODE_SET_PARAM_ZERO, + PARAMETER_VALUE, 0, // numSamples = 0 0, // cost = 0 false); // sendBackParameter = false @@ -401,15 +421,18 @@ public: * @param[in] If true, and if all clients call waitPassFinish, signal all * clients finish 
the pass. */ - void doOperation(PreparedOperations& ops, bool waitForGradient, - bool sendBackParameter, bool releasePass = true); + void doOperation(PreparedOperations& ops, + bool waitForGradient, + bool sendBackParameter, + bool releasePass = true); /** * Set the configuration of pserver, including parameter config and * optimization config */ void setConfig(const OptimizationConfig& optConfig, - const std::string& saveDir = "", bool isSparseServer = false); + const std::string& saveDir = "", + bool isSparseServer = false); /// Return true if all pservers are in the given status bool inStatus(PServerStatus status); @@ -454,7 +477,9 @@ public: void vectorAddMult(PServerVector u, PServerVector v, real a); /// u = v + w * a - void vectorAddMultInto(PServerVector u, PServerVector v, PServerVector w, + void vectorAddMultInto(PServerVector u, + PServerVector v, + PServerVector w, real a); /// u = v * a void vectorScaleInto(PServerVector u, PServerVector v, real a); @@ -491,7 +516,8 @@ public: protected: template - void multiCall(const char* funcName, const ProtoIn& request, + void multiCall(const char* funcName, + const ProtoIn& request, std::vector* responses) { responses->resize(clients_.size()); size_t numClients = clients_.size(); @@ -511,10 +537,12 @@ private: * to all pservers. it is called under one SyncThreadPool. it * supports to use N thread to control M connections. the receiving * actions can be started until all sending action to all connections - * owned by current thread are finished. Different connections controlled + * owned by current thread are finished. Different connections + * controlled * by different threads can transfer data asynchronously. */ - void sendParallel(int tid, size_t numThreads, + void sendParallel(int tid, + size_t numThreads, ParameterType recvParameterType); /// sending thread routine for asynchronously send data void send(int threadId); @@ -535,9 +563,12 @@ private: ParameterUpdateMode updateMode, ParameterType parameterType, // client send type const std::vector& parameterSegments, - int64_t numSamples, real cost, bool sendBackParameter, + int64_t numSamples, + real cost, + bool sendBackParameter, ParameterType sendBackParameterType, // send back type in pserver - BatchStatus batchStatus, SendJob* sendJob); + BatchStatus batchStatus, + SendJob* sendJob); /// start necessary threads for threadPool void initThreads(); diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 960fca2853b86cded3889748afe115060e0a0293..b7f999f8b132e59ce8b7dffe5c4d43615e4c564c 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -31,10 +31,12 @@ limitations under the License. 
*/ #include "paddle/utils/GlobalConstants.h" P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec"); -P_DEFINE_double(async_lagged_ratio_min, 1.0, +P_DEFINE_double(async_lagged_ratio_min, + 1.0, "control config_.async_lagged_grad_discard_ratio() min value"); P_DEFINE_double( - async_lagged_ratio_default, 1.5, + async_lagged_ratio_default, + 1.5, "if async_lagged_grad_discard_ratio is not set in trainer_config.conf" "use it as defalut value"); @@ -47,7 +49,8 @@ const std::string ParameterServer2::kRetMsgInvalidVectorHandle = const std::string ParameterServer2::kRetMsgUnknownOperation = "Unknown operation"; -ParameterServer2::ParameterServer2(const std::string& addr, int port, +ParameterServer2::ParameterServer2(const std::string& addr, + int port, int rdmaCpu) : ProtoServer(addr, port, rdmaCpu), dataSize_(0), @@ -59,12 +62,12 @@ ParameterServer2::ParameterServer2(const std::string& addr, int port, allClientPassFinish_(false), serverId_(-1), batchId_(-1) { - /** - * register function for remote client calling, these functions - * will be mapped to a data structure for quick looking up. each - * request from trainer can contains one function name to indicate - * remote action. this architecture looks like rpc style for pserver. - */ + /** + * register function for remote client calling, these functions + * will be mapped to a data structure for quick looking up. each + * request from trainer can contains one function name to indicate + * remote action. this architecture looks like rpc style for pserver. + */ REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendParameter); REGISTER_SERVICE_FUNCTION_EX(ParameterServer2, sendData); REGISTER_SERVICE_FUNCTION(ParameterServer2, setConfig); @@ -150,12 +153,12 @@ void ParameterServer2::setConfig(const SetConfigRequest& request, mkDir(request.save_dir().c_str()); } - for (const auto& config : request.param_configs()) { - CHECK(!configMap_.count(config.para_id())) - << "Duplicated parameter name: " << config.name(); - configMap_[config.para_id()] = config; - CHECK_EQ(config.sparse_remote_update(), isSparseServer_); - } + for (const auto& config : request.param_configs()) { + CHECK(!configMap_.count(config.para_id())) + << "Duplicated parameter name: " << config.name(); + configMap_[config.para_id()] = config; + CHECK_EQ(config.sparse_remote_update(), isSparseServer_); + } config_ = request.opt_config(); if (config_.algorithm() == TrainAlgorithm::AsyncSGD) { @@ -267,9 +270,9 @@ void ParameterServer2::setParameter(const SendParameterRequest& request, if (!request.blocks().size()) { LOG(WARNING) - << "--ports_num or --ports_num_for_sparse might be too large, " - << "or total dense parameter size or sparse parameters size " - << "might be too small, this psever doesn't store any parameter."; + << "--ports_num or --ports_num_for_sparse might be too large, " + << "or total dense parameter size or sparse parameters size " + << "might be too small, this psever doesn't store any parameter."; return; } @@ -339,8 +342,8 @@ void ParameterServer2::setParameter(const SendParameterRequest& request, << "width : " << width; } info.optimizer->init(1, info.config); - usedSegments_.push_back(std::make_pair(offsets[i], - offsets[i] + request.blocks(i).block_size())); + usedSegments_.push_back(std::make_pair( + offsets[i], offsets[i] + request.blocks(i).block_size())); } mergeSegments(&usedSegments_); @@ -364,15 +367,18 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::vector* outputBuffers) { VLOG(1) << "pserver: 
addGradient"; - /// forwardbackward delta from all trainers - /// indicate the fluctuation caused by forwardbackward. +/// forwardbackward delta from all trainers +/// indicate the fluctuation caused by forwardbackward. #ifndef PADDLE_METRIC_LEARNING // @TODO(yanfei): // add support tuning forwardbackward balance for metric learning if (!numPassFinishClients_) { REGISTER_BARRIER_DELTA_SERVER_SET( - *statSet_, "forwardbackwardDelta", FLAGS_num_gradient_servers, - request.trainer_id(), request.forwardbackward_time(), + *statSet_, + "forwardbackwardDelta", + FLAGS_num_gradient_servers, + request.trainer_id(), + request.forwardbackward_time(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } #endif @@ -390,14 +396,19 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// barrier fluctuation caused by network and previous forwardbackward if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER_SET( - *statSet_, "handleReqBegin", FLAGS_num_gradient_servers, - request.trainer_id(), (*handleRequestBegin_), + *statSet_, + "handleReqBegin", + FLAGS_num_gradient_servers, + request.trainer_id(), + (*handleRequestBegin_), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "addGradBegin", FLAGS_num_gradient_servers, + *statSet_, + "addGradBegin", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -414,8 +425,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, int64_t blockId = getBlockId(block); CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() - << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); Buffer buffer = inputBuffers[bufferIndex]; ++bufferIndex; @@ -438,7 +449,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "addGradCoreFinish", FLAGS_num_gradient_servers, + *statSet_, + "addGradCoreFinish", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -453,7 +466,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// numPassFinishClients_ means some trainer has entered finishPass if (!numPassFinishClients_) { REGISTER_SLOW_NODES_PROBE( - *statSet_, "SLOW_NODES", FLAGS_num_gradient_servers, + *statSet_, + "SLOW_NODES", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -463,7 +478,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// if wait pass finish does not start, do check if (!numPassFinishClients_) { - CHECK_BARRIER_TIMER(*statSet_, "SLOW_NODES", FLAGS_num_gradient_servers, + CHECK_BARRIER_TIMER(*statSet_, + "SLOW_NODES", + FLAGS_num_gradient_servers, isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } @@ -471,7 +488,9 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, /// can indicate the fluctation caused by computation at pserver. if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "paraReady", FLAGS_num_gradient_servers, + *statSet_, + "paraReady", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? 
"_sparseUpdater" : "_denseUpdater"); } @@ -481,7 +500,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, { /// total time except overhead of network. REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend", - timeToMicroSecond(*addGradBegin_), -1, + timeToMicroSecond(*addGradBegin_), + -1, *statSet_); } } @@ -609,7 +629,8 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request, << " block id=" << block.block_id(); int64_t blockId = getBlockId(block); CHECK_GE(blockId, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); Buffer buffer = inputBuffers[bufferIndex]; ++bufferIndex; @@ -730,10 +751,11 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, int64_t offset = getBlockOffset(block); CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); real* valueBuffer = vectors_[parameterType]->getPoint(offset); - outputBuffers->push_back({valueBuffer, (size_t) block.block_size()}); + outputBuffers->push_back({valueBuffer, (size_t)block.block_size()}); } void ParameterServer2::sendBackParameter(const ParameterBlock& block, @@ -749,7 +771,8 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, int64_t offset = getBlockOffset(block); CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); size_t size = buffer->size; real* valueBuffer = vectors_[parameterType]->getPoint(offset); @@ -759,8 +782,11 @@ void ParameterServer2::sendBackParameter(const ParameterBlock& block, } void ParameterServer2::sendBackParameterSparse( - const ParameterBlock& block, int parameterType, - SendParameterResponse* response, Buffer* buffer, size_t width, + const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, + size_t width, std::vector* outputBuffers) { ParameterBlock* returnBlock = response->add_blocks(); returnBlock->set_para_id(block.para_id()); @@ -769,7 +795,8 @@ void ParameterServer2::sendBackParameterSparse( returnBlock->set_block_size(block.block_size()); int64_t offset = getBlockOffset(block); CHECK_GE(offset, 0) << "Only existing parameter block is allowed: " - << " id=" << block.para_id() << " block id=" << block.block_id(); + << " id=" << block.para_id() + << " block id=" << block.block_id(); real* valueBuffer = vectors_[parameterType]->getPoint(offset); CHECK_EQ(buffer->size, width); @@ -781,7 +808,7 @@ void ParameterServer2::readAllBlocks( MsgReader* msgReader, std::vector* buffers) { auto& buffer = *readWriteBuffer_; size_t numBlocks = msgReader->getNumBlocks(); - buffer.resizeWithAlignHints(msgReader->getTotalLength()/sizeof(real), + buffer.resizeWithAlignHints(msgReader->getTotalLength() / sizeof(real), numBlocks); std::vector bufs(numBlocks); buffers->clear(); @@ -861,7 +888,9 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, /// indicates network flucatuation for big message. if (!numPassFinishClients_) { REGISTER_BARRIER_TIMER_SERVER( - *statSet_, "sendParamFinish", FLAGS_num_gradient_servers, + *statSet_, + "sendParamFinish", + FLAGS_num_gradient_servers, request.trainer_id(), isSparseServer_ ? 
"_sparseUpdater" : "_denseUpdater"); } @@ -871,13 +900,15 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request, /// total time including overhead of network. REGISTER_TIMER_DYNAMIC_SET("sendParaTotal", timeToMicroSecond(*handleRequestBegin_), - -1, *statSet_); + -1, + *statSet_); } /// all time exhausted in pserverServer except recieve network. { /// total time except overhead of network receive REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv", - timeToMicroSecond(*addGradBegin_), -1, + timeToMicroSecond(*addGradBegin_), + -1, *statSet_); } } @@ -1007,36 +1038,42 @@ void ParameterServer2::clearUnusedSegments(CpuVector* vec) { return; } memset(data, 0, sizeof(real) * usedSegments_[0].first); - memset(data + usedSegments_.back().second, 0, + memset(data + usedSegments_.back().second, + 0, sizeof(real) * (size_ - usedSegments_.back().second)); size_t n = size_ - usedSegments_.back().second; for (size_t i = 1; i < usedSegments_.size(); ++i) { memset( - data + usedSegments_[i - 1].second, 0, + data + usedSegments_[i - 1].second, + 0, sizeof(real) * (usedSegments_[i].first - usedSegments_[i - 1].second)); n += usedSegments_[i].first - usedSegments_[i - 1].second; } } void ParameterServer2::parallelExecForEachBlock(ExecFunc func) { - SyncThreadPool::execHelper(syncThreadPool_.get(), [&](int tid, - size_t numThreads) { - int64_t numBlocks = blockIdMap_.size(); - VectorPtr* vecs = Parameter::getTlsTempBufs(); - for (int64_t blockId = tid; blockId < numBlocks; blockId += numThreads) { - func(blockId, vecs); - } - }); + SyncThreadPool::execHelper(syncThreadPool_.get(), + [&](int tid, size_t numThreads) { + int64_t numBlocks = blockIdMap_.size(); + VectorPtr* vecs = Parameter::getTlsTempBufs(); + for (int64_t blockId = tid; blockId < numBlocks; + blockId += numThreads) { + func(blockId, vecs); + } + }); } void ParameterServer2::blockTraverse( - BlockInfo& info, const ParameterConfig& config, int64_t offset, size_t size, + BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, const VectorPtr vecs[], const ParameterOptimizer::TraverseCallback& callback) { /// setup sub bufs for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); + vecs[type]->subVecFrom(*vectors_[type], offset, size); } callback(vecs, config, config.sparse_remote_update() ? 0 : -1LU); } @@ -1064,10 +1101,10 @@ void ParameterServer2::op_SGD(const Operation& operation, info.optimizer->startBatch(numSamplesProcessed_); for (const auto type : info.optimizer->getParameterTypes()) { - vecs[type]->subVecFrom(*vectors_[type], offset, size); + vecs[type]->subVecFrom(*vectors_[type], offset, size); } - info.optimizer->update(vecs, config, - config.sparse_remote_update() ? 0 : -1LU); + info.optimizer->update( + vecs, config, config.sparse_remote_update() ? 0 : -1LU); vecs[PARAMETER_GRADIENT]->zeroMem(); if (auto callback = info.optimizer->needSpecialTraversal(config)) { diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h index ceb1ad69e9ec51894d869cee63f48950e5e8fa7c..ccaea42e7d0cb1865234702315fd4bbd00e548d5 100644 --- a/paddle/pserver/ParameterServer2.h +++ b/paddle/pserver/ParameterServer2.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -55,7 +54,6 @@ namespace paddle { // computation causes big optimization latency, the GPU may be required by // pserver. - /** * Client interface for the parameter server * @@ -189,9 +187,10 @@ protected: */ constexpr static size_t AlignElementCount = AlignBytes / sizeof(T); - static_assert( - AlignElementCount == (AlignElementCount & -AlignElementCount) - || AlignBytes > sizeof(T), "AlignElementCount should be exp of 2"); + static_assert(AlignElementCount == + (AlignElementCount & -AlignElementCount) || + AlignBytes > sizeof(T), + "AlignElementCount should be exp of 2"); /** * @brief Resize Buffer, with block count that will be allocated. Each block @@ -205,7 +204,7 @@ protected: } else { //! at most, we need such elements in buffer to make sure each block is //! aligned. - this->resize(size + alignBlockCount* (AlignElementCount - 1)); + this->resize(size + alignBlockCount * (AlignElementCount - 1)); } } @@ -224,8 +223,8 @@ protected: curOffset_ += blockSize; if (!IsTLargerThanAlign) { - curOffset_ = (curOffset_ + AlignElementCount - 1) & - ~(AlignElementCount -1); + curOffset_ = + (curOffset_ + AlignElementCount - 1) & ~(AlignElementCount - 1); } return r; } @@ -369,7 +368,8 @@ public: /** * @brief send config to pserver * - * @note it can help pserver to understand the configuration for optimization, + * @note it can help pserver to understand the configuration for + * optimization, * logging control, duplicated initialization, etc. */ void setConfig(const SetConfigRequest& request, @@ -545,17 +545,17 @@ protected: std::vector* buffers); const ParameterConfig& getParameterConfig(const ParameterBlock& block) { - CHECK_LT(block.para_id(), -1UL) - << "invalid parameter id:" << block.para_id(); + CHECK_LT(block.para_id(), -1UL) << "invalid parameter id:" + << block.para_id(); const auto it = configMap_.find(block.para_id()); - CHECK(it != configMap_.end()) - << "can not find parameter id: " << block.para_id(); + CHECK(it != configMap_.end()) << "can not find parameter id: " + << block.para_id(); return it->second; } /// it implicitly checks blockOffsetMap_ while retrieving blockId const ParameterConfig& getParameterConfig(int64_t blockId) const { - CHECK(blockId >= 0 && blockId < (int64_t) blockInfos_.size()) + CHECK(blockId >= 0 && blockId < (int64_t)blockInfos_.size()) << "block idx out of range, id: " << blockId << " info size: " << blockInfos_.size(); return *(blockInfos_[blockId].config); @@ -614,7 +614,8 @@ protected: * vectors_[parameterType] directly * for dense with sync-sgd */ - void sendBackParameter(const ParameterBlock& block, int parameterType, + void sendBackParameter(const ParameterBlock& block, + int parameterType, SendParameterResponse* response, std::vector* outputBuffers); @@ -627,16 +628,20 @@ protected: * to buffer->base.
* for dense with async-sgd */ - void sendBackParameter(const ParameterBlock& block, int parameterType, - SendParameterResponse* response, Buffer* buffer, + void sendBackParameter(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, std::vector* outputBuffers); /** * @brief prepare data for sending back * * @note specified for sparse */ - void sendBackParameterSparse(const ParameterBlock& block, int parameterType, - SendParameterResponse* response, Buffer* buffer, + void sendBackParameterSparse(const ParameterBlock& block, + int parameterType, + SendParameterResponse* response, + Buffer* buffer, size_t width, std::vector* outputBuffers); @@ -648,8 +653,11 @@ protected: */ typedef std::function ExecFunc; void parallelExecForEachBlock(ExecFunc func); - void blockTraverse(BlockInfo& info, const ParameterConfig& config, - int64_t offset, size_t size, const VectorPtr vecs[], + void blockTraverse(BlockInfo& info, + const ParameterConfig& config, + int64_t offset, + size_t size, + const VectorPtr vecs[], const ParameterOptimizer::TraverseCallback& callback); public: diff --git a/paddle/pserver/ProtoServer.cpp b/paddle/pserver/ProtoServer.cpp index 0ce06ddf9180299c0ecf28669fe96e9668d9d48b..2f6d911a017d231692c42f2a235cf1e15257f7ae 100644 --- a/paddle/pserver/ProtoServer.cpp +++ b/paddle/pserver/ProtoServer.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "ProtoServer.h" namespace paddle { @@ -42,8 +41,8 @@ void ProtoServer::handleRequest(std::unique_ptr msgReader, void ProtoServer::registerServiceFunctionImp(const std::string& funcName, ServiceFunction func) { - CHECK(!nameToFuncMap_.count(funcName)) - << "Duplicated registration: " << funcName; + CHECK(!nameToFuncMap_.count(funcName)) << "Duplicated registration: " + << funcName; nameToFuncMap_[funcName] = func; } diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h index 86e715868356ca1939dac819b52e816e19d7d361..cf08e24ff3ef47d9c17bfe14d7d3aff1537b8ce8 100644 --- a/paddle/pserver/ProtoServer.h +++ b/paddle/pserver/ProtoServer.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "LightNetwork.h" @@ -23,17 +22,17 @@ limitations under the License. */ namespace paddle { - /** - * - * It implements the rpc framework, which launchs one thread for each - * connection. Here define one parameter server as single TCP server - * binding on single port. All connections share single tcp ProtoServer - * object, each connection handles all requests from specified trainer - * within single worker thread. - * to accelerate bandwidth efficiency and harness multicore for pserver - * optimization to reduce pserver latency, you could launch more port - * for single NIC hardward with --port=N(N>1) for small cluster job. - */ +/** + * + * It implements the rpc framework, which launchs one thread for each + * connection. Here define one parameter server as single TCP server + * binding on single port. All connections share single tcp ProtoServer + * object, each connection handles all requests from specified trainer + * within single worker thread. 
+ * to accelerate bandwidth efficiency and harness multicore for pserver + * optimization to reduce pserver latency, you could launch more port + * for single NIC hardward with --port=N(N>1) for small cluster job. + */ class ProtoServer : public SocketServer { public: /// rdmaCpu controls the cpu affinity of RDMA server daemon, @@ -84,7 +83,8 @@ public: template void registerServiceFunctionEx( const std::string& funcName, - std::function msgReader, + std::function msgReader, ProtoResponseCallbackEx callback)> func); protected: @@ -120,7 +120,8 @@ protected: class ProtoClient : public SocketClient { public: - ProtoClient(const std::string& serverAddr, int serverPort, + ProtoClient(const std::string& serverAddr, + int serverPort, enum ChannelType channelType = F_TCP) : SocketClient(serverAddr, serverPort, channelType) {} @@ -133,7 +134,8 @@ public: * @note iov provides additional blocks which need to be written to the * communication channel */ - void send(const char* funcName, const google::protobuf::MessageLite& proto, + void send(const char* funcName, + const google::protobuf::MessageLite& proto, const std::vector& iov = std::vector()); /** @@ -148,7 +150,8 @@ public: /// combines send() and recv() std::unique_ptr sendAndRecv( - const char* funcName, const google::protobuf::MessageLite& protoIn, + const char* funcName, + const google::protobuf::MessageLite& protoIn, google::protobuf::MessageLite* protoOut) { send(funcName, protoIn); return recv(protoOut); @@ -156,8 +159,10 @@ public: /// combines send() and recv() std::unique_ptr sendAndRecv( - const char* funcName, const google::protobuf::MessageLite& protoIn, - const std::vector& iov, google::protobuf::MessageLite* protoOut) { + const char* funcName, + const google::protobuf::MessageLite& protoIn, + const std::vector& iov, + google::protobuf::MessageLite* protoOut) { send(funcName, protoIn, iov); return recv(protoOut); } @@ -172,52 +177,62 @@ struct service_arg_type { }; template -struct service_arg_type, - Arg2)> { +struct service_arg_type, + Arg2)> { typedef Arg1 _1; }; /// register a service function to the ProtoServer /// This should only be used within a member function of className -#define REGISTER_SERVICE_FUNCTION(className, funcName) \ - registerServiceFunction< \ - service_arg_type::_1>( \ - #funcName, std::bind(&className::funcName, this, std::placeholders::_1, \ - std::placeholders::_2)) +#define REGISTER_SERVICE_FUNCTION(className, funcName) \ + registerServiceFunction< \ + service_arg_type::_1>( \ + #funcName, \ + std::bind(&className::funcName, \ + this, \ + std::placeholders::_1, \ + std::placeholders::_2)) /// register a service function to the ProtoServer /// This should only be used within a member function of className -#define REGISTER_SERVICE_FUNCTION_EX(className, funcName) \ - registerServiceFunctionEx< \ - service_arg_type::_1>( \ - #funcName, std::bind(&className::funcName, this, std::placeholders::_1, \ - std::placeholders::_2, std::placeholders::_3)) +#define REGISTER_SERVICE_FUNCTION_EX(className, funcName) \ + registerServiceFunctionEx< \ + service_arg_type::_1>( \ + #funcName, \ + std::bind(&className::funcName, \ + this, \ + std::placeholders::_1, \ + std::placeholders::_2, \ + std::placeholders::_3)) /// create wrapper function for parameter server high level function and /// register the wrapper function into function mapping. 
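Both registration templates below funnel into registerServiceFunctionImp, a checked insert into the name-to-handler map that handleRequest later consults; the wrapper they build just deserializes the request and re-serializes the response around the user callback. A minimal sketch of that dispatch idea with hypothetical types (plain strings standing in for protobuf messages):

#include <functional>
#include <map>
#include <stdexcept>
#include <string>

// Hypothetical name-to-handler dispatcher, the idea behind nameToFuncMap_.
class DispatcherSketch {
public:
  using Handler = std::function<std::string(const std::string& requestBytes)>;

  void registerHandler(const std::string& name, Handler h) {
    // Same guard as the "Duplicated registration" CHECK in ProtoServer.
    if (!handlers_.emplace(name, std::move(h)).second) {
      throw std::runtime_error("Duplicated registration: " + name);
    }
  }

  std::string call(const std::string& name, const std::string& requestBytes) {
    return handlers_.at(name)(requestBytes);  // throws for unknown names
  }

private:
  std::map<std::string, Handler> handlers_;
};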
template void ProtoServer::registerServiceFunctionEx( const std::string& funcName, - std::function msgReader, + std::function msgReader, ProtoResponseCallbackEx callback)> func) { - auto f = - [func](std::unique_ptr msgReader, ResponseCallback callback) { - ProtoIn request; - std::string str(msgReader->getNextBlockLength(), 0); - msgReader->readNextBlock(&str[0]); - CHECK(request.ParseFromString(str)); - auto pcob = [callback](const google::protobuf::MessageLite& response, - const std::vector& outputIovs) { - std::string out; - CHECK(response.SerializeToString(&out)); - std::vector iovs; - iovs.push_back({&out[0], out.size()}); - iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end()); - callback(iovs); - }; - - func(request, std::move(msgReader), pcob); - }; + auto f = [func](std::unique_ptr msgReader, + ResponseCallback callback) { + ProtoIn request; + std::string str(msgReader->getNextBlockLength(), 0); + msgReader->readNextBlock(&str[0]); + CHECK(request.ParseFromString(str)); + auto pcob = [callback](const google::protobuf::MessageLite& response, + const std::vector& outputIovs) { + std::string out; + CHECK(response.SerializeToString(&out)); + std::vector iovs; + iovs.push_back({&out[0], out.size()}); + iovs.insert(iovs.end(), outputIovs.begin(), outputIovs.end()); + callback(iovs); + }; + + func(request, std::move(msgReader), pcob); + }; registerServiceFunctionImp(funcName, f); } @@ -226,24 +241,24 @@ template void ProtoServer::registerServiceFunction( const std::string& funcName, std::function func) { - auto f = - [func](std::unique_ptr msgReader, ResponseCallback callback) { - ProtoIn request; - std::string str(msgReader->getNextBlockLength(), 0); - msgReader->readNextBlock(&str[0]); - CHECK(request.ParseFromString(str)); - msgReader.reset(); - - auto pcob = [callback](const google::protobuf::MessageLite& response) { - std::string out; - CHECK(response.SerializeToString(&out)); - std::vector iovs; - iovs.push_back({&out[0], out.size()}); - callback(iovs); - }; - - func(request, pcob); - }; + auto f = [func](std::unique_ptr msgReader, + ResponseCallback callback) { + ProtoIn request; + std::string str(msgReader->getNextBlockLength(), 0); + msgReader->readNextBlock(&str[0]); + CHECK(request.ParseFromString(str)); + msgReader.reset(); + + auto pcob = [callback](const google::protobuf::MessageLite& response) { + std::string out; + CHECK(response.SerializeToString(&out)); + std::vector iovs; + iovs.push_back({&out[0], out.size()}); + callback(iovs); + }; + + func(request, pcob); + }; registerServiceFunctionImp(funcName, f); } diff --git a/paddle/pserver/RDMANetwork.h b/paddle/pserver/RDMANetwork.h index 05b845b68a150cb36fa4ba09150bc8f41e3922c8..4e492a3afd120462ac6e056b9df850063c503a53 100644 --- a/paddle/pserver/RDMANetwork.h +++ b/paddle/pserver/RDMANetwork.h @@ -76,7 +76,7 @@ inline sxi_sock* accept(sxi_socket* s) { inline sockaddr_in* getSourceAddress(sxi_sock* sock) { #ifndef PADDLE_DISABLE_RDMA - return reinterpret_cast(&sock->sa); + return reinterpret_cast(&sock->sa); #else PROMPT_ERR(); #endif @@ -98,7 +98,6 @@ inline int close(sxi_sock* sock) { #endif } - inline void init() { #ifndef PADDLE_DISABLE_RDMA sxi_module_init(); @@ -155,6 +154,5 @@ inline sxi_sock* connect(sxi_socket* socket, const char* url) { #endif } - } // namespace rdma } // namespace paddle diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp index 20295d7cdc22b5dba6380a0792eafef9feec257a..4ebc47d32659d82f32b9da529aec7ec3f46f77a9 100644 --- a/paddle/pserver/SocketChannel.cpp +++ 
b/paddle/pserver/SocketChannel.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "SocketChannel.h" #include @@ -35,7 +34,6 @@ namespace paddle { #define UIO_MAXIOV 512 #endif - SocketChannel::~SocketChannel() { if (tcpRdma_ == F_TCP) close(tcpSocket_); @@ -81,8 +79,12 @@ size_t SocketChannel::write(const void* buf, size_t size) { } template -static size_t readwritev(IOFunc iofunc, SocketType socket, iovec* iovs, - int iovcnt, int maxiovs, const std::string& peerName) { +static size_t readwritev(IOFunc iofunc, + SocketType socket, + iovec* iovs, + int iovcnt, + int maxiovs, + const std::string& peerName) { int curIov = 0; size_t total = 0; @@ -123,25 +125,40 @@ static size_t readwritev(IOFunc iofunc, SocketType socket, iovec* iovs, return size; } - /// rdma::readv and rdma::writev can take advantage of RDMA blocking offload /// transfering size_t SocketChannel::writev(const std::vector& iovs) { if (tcpRdma_ == F_TCP) - return readwritev(::writev, tcpSocket_, const_cast(&iovs[0]), - iovs.size(), UIO_MAXIOV, peerName_); + return readwritev(::writev, + tcpSocket_, + const_cast(&iovs[0]), + iovs.size(), + UIO_MAXIOV, + peerName_); else - return readwritev(rdma::writev, rdmaSocket_, const_cast(&iovs[0]), - iovs.size(), MAX_VEC_SIZE, peerName_); + return readwritev(rdma::writev, + rdmaSocket_, + const_cast(&iovs[0]), + iovs.size(), + MAX_VEC_SIZE, + peerName_); } size_t SocketChannel::readv(std::vector* iovs) { if (tcpRdma_ == F_TCP) - return readwritev(::readv, tcpSocket_, const_cast(&(*iovs)[0]), - iovs->size(), UIO_MAXIOV, peerName_); + return readwritev(::readv, + tcpSocket_, + const_cast(&(*iovs)[0]), + iovs->size(), + UIO_MAXIOV, + peerName_); else - return readwritev(rdma::readv, rdmaSocket_, const_cast(&(*iovs)[0]), - iovs->size(), MAX_VEC_SIZE, peerName_); + return readwritev(rdma::readv, + rdmaSocket_, + const_cast(&(*iovs)[0]), + iovs->size(), + MAX_VEC_SIZE, + peerName_); } void SocketChannel::writeMessage(const std::vector& userIovs) { @@ -157,8 +174,8 @@ void SocketChannel::writeMessage(const std::vector& userIovs) { std::vector iovs; iovs.reserve(userIovs.size() + 2); iovs.push_back({&header, sizeof(header)}); - iovs.push_back({&iovLengths[0], static_cast( - sizeof(iovLengths[0]) * header.numIovs)}); + iovs.push_back({&iovLengths[0], + static_cast(sizeof(iovLengths[0]) * header.numIovs)}); iovs.insert(iovs.end(), userIovs.begin(), userIovs.end()); header.totalLength = 0; diff --git a/paddle/pserver/SocketChannel.h b/paddle/pserver/SocketChannel.h index fb9ac2e1dc23d9921777427540fb482e9bb0bd08..472b37a12283ca1c358034427d491804af765171 100644 --- a/paddle/pserver/SocketChannel.h +++ b/paddle/pserver/SocketChannel.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp index 31682c158e8006e071d681b29322b6000a9d1329..2085b22a95138fa8caf474a081fb46229688966f 100644 --- a/paddle/pserver/SparseParameterDistribution.cpp +++ b/paddle/pserver/SparseParameterDistribution.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
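The readwritev helper reformatted above exists because ::writev and ::readv accept at most UIO_MAXIOV vectors per call and may transfer fewer bytes than requested. A hedged sketch of the partial-write bookkeeping for the send side (plain POSIX, assuming iovcnt is already within the UIO_MAXIOV cap, and mutating the iovec array the way the helper does):

#include <sys/types.h>
#include <sys/uio.h>
#include <cstddef>

// Write every byte described by iovs[0..iovcnt); returns false on error.
bool writevAll(int fd, iovec* iovs, int iovcnt) {
  while (iovcnt > 0) {
    ssize_t n = ::writev(fd, iovs, iovcnt);
    if (n < 0) return false;
    // Drop vectors the kernel consumed completely...
    while (iovcnt > 0 && static_cast<size_t>(n) >= iovs->iov_len) {
      n -= iovs->iov_len;
      ++iovs;
      --iovcnt;
    }
    // ...then trim the partially written one and retry.
    if (iovcnt > 0) {
      iovs->iov_base = static_cast<char*>(iovs->iov_base) + n;
      iovs->iov_len -= n;
    }
  }
  return true;
}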
See the License for the specific language governing permissions and
limitations under the License. */
-
#include
#include "paddle/utils/Logging.h"
@@ -21,19 +20,24 @@ limitations under the License. */
#include "SparseParameterDistribution.h"

-P_DEFINE_bool(check_sparse_distribution_in_pserver, false,
+P_DEFINE_bool(check_sparse_distribution_in_pserver,
+              false,
              "check whether sparse parameter exhibits balanced distribution at "
              "all pservers");
-P_DEFINE_bool(show_check_sparse_distribution_log, false,
+P_DEFINE_bool(show_check_sparse_distribution_log,
+              false,
              "show logs details for sparse parameter distribution in pserver");
-P_DEFINE_int32(check_sparse_distribution_batches, 100,
+P_DEFINE_int32(check_sparse_distribution_batches,
+               100,
               "run sparse parameter distribution check for N batches");
P_DEFINE_double(
-    check_sparse_distribution_ratio, 0.6,
+    check_sparse_distribution_ratio,
+    0.6,
    "if parameters dispatched to different pservers exhibit unbalanced "
    " distribution for check_sparse_distribution_ratio * "
    " check_sparse_distribution_batches times, crash program");
-P_DEFINE_double(check_sparse_distribution_unbalance_degree, 2.0,
+P_DEFINE_double(check_sparse_distribution_unbalance_degree,
+                2.0,
                "the ratio of maximum data size and minimum data size for "
                "different pserver");
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 260aed0083c5d19ba6a766a70b51e30042389e38..24c90f10785a6f5870ab291a5c5e6c13fbc0d49f 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "paddle/utils/Util.h"

#include
@@ -184,7 +183,8 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {

  bzero((char*)&serv_addr, sizeof(serv_addr));
  serv_addr.sin_family = AF_INET;
-  bcopy((char*)server->h_addr, (char*)&serv_addr.sin_addr.s_addr,
+  bcopy((char*)server->h_addr,
+        (char*)&serv_addr.sin_addr.s_addr,
        server->h_length);
  serv_addr.sin_port = htons(serverPort);
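The P_DEFINE_* macros being re-wrapped above follow the gflags pattern: each definition creates a global named FLAGS_<name> holding the parsed value. A minimal sketch follows; the flag name and the header path providing the macros are illustrative assumptions, not taken from the patch.

// Hypothetical flag, mirroring the definitions above; the header that
// provides P_DEFINE_* is assumed to be paddle/utils/CommandLineParser.h.
#include "paddle/utils/CommandLineParser.h"

P_DEFINE_int32(my_check_batches,
               100,
               "run the sparse-distribution check every so many batches");

bool shouldCheck(int batchId) {
  // The macro exposes the parsed value through a FLAGS_-prefixed global.
  return FLAGS_my_check_batches > 0 && batchId % FLAGS_my_check_batches == 0;
}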
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
index c9722f1212ae9b7cab15c5ae314c604ffa8f0647..eb813e92d6d696db6c2ced543a00594b69c7f5af 100644
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
@@ -27,7 +27,9 @@ P_DEFINE_int32(server_cpu, 0, "assign server cpu");
class ParameterServer2Tester : public ParameterServer2 {
public:
-  ParameterServer2Tester(std::string serverAddr, int port, int rdmaCpu = -1,
+  ParameterServer2Tester(std::string serverAddr,
+                         int port,
+                         int rdmaCpu = -1,
                         bool sepSendAndRecv = false)
      : ParameterServer2(serverAddr, port, rdmaCpu), client_(sepSendAndRecv) {}
  virtual ~ParameterServer2Tester() {}
@@ -63,7 +65,7 @@ public:
  }

  size_t id = 0;
-  for (auto &para : parameters_) {
+  for (auto& para : parameters_) {
    para->setID(id++);
  }
@@ -560,8 +562,8 @@ TEST(ParameterServer2, sendData) {
  std::unique_ptr<ParameterServer2Tester> g_server2;
  std::unique_ptr<ParameterServer2Tester> g_server3;
  if (FLAGS_rdma_tcp == "rdma") {
-    g_server1.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port,
-                                               FLAGS_server_cpu));
+    g_server1.reset(new ParameterServer2Tester(
+        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
    g_server1->start();
    g_server2.reset(new ParameterServer2Tester(
        FLAGS_server_addr, FLAGS_port + 1, FLAGS_server_cpu + 1));
@@ -604,8 +606,8 @@ int main(int argc, char** argv) {
  FLAGS_num_gradient_servers = 2;

  if (FLAGS_rdma_tcp == "rdma") {
-    g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port,
-                                              FLAGS_server_cpu));
+    g_server.reset(new ParameterServer2Tester(
+        FLAGS_server_addr, FLAGS_port, FLAGS_server_cpu));
  } else {
    g_server.reset(new ParameterServer2Tester(FLAGS_server_addr, FLAGS_port));
  }
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 065d6b3396be2287ee14226b4cf9b07be32e63e0..79d1f2743a1c2e6050afe48d6cf86a1084a4500c 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -126,9 +126,11 @@ TEST(ProtoServer, extended) {
  GetStatusResponse response;
  {
    REGISTER_TIMER("sendAndRecv");
-    auto msgReader = client->sendAndRecv(
-        "getStatusEx", request, {{cpuGrad.getData(), (size_t)dataSize}},
-        &response);
+    auto msgReader =
+        client->sendAndRecv("getStatusEx",
+                            request,
+                            {{cpuGrad.getData(), (size_t)dataSize}},
+                            &response);

    EXPECT_EQ(msgReader->getNumBlocks(), (size_t)1);
    EXPECT_EQ(msgReader->getNextBlockLength(), (size_t)dataSize);
diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp
index bb309a54975a1dfc386bfb440c90a6dd408205c3..2be9cd62235a262812231579c536a5f0596b69d9 100644
--- a/paddle/trainer/ParamUtil.cpp
+++ b/paddle/trainer/ParamUtil.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ParamUtil.h"

#include
@@ -48,8 +47,6 @@ ParameterUtil::ParameterUtil(
  pUpdater_ = parameterUpdater;
}

-
-
bool ParameterUtil::loadParameters(int passId, bool local, bool remote) {
  constexpr int kBufLen = 100;
  char buf[kBufLen];
@@ -60,8 +57,9 @@ bool ParameterUtil::loadParameters(int passId, bool local, bool remote) {
  return true;
}

-void ParameterUtil::loadParametersWithPath(const std::string& dir,
-                                           bool local, bool remote) {
+void ParameterUtil::loadParametersWithPath(const std::string &dir,
+                                           bool local,
+                                           bool remote) {
  if (local) {
    gserver_->loadParameters(dir);
  }
@@ -98,7 +96,7 @@ void ParameterUtil::saveParameters(int passId, int passInnerId) {
  mkDir(saveDir.c_str());
  if (!intConfig_->load_save_param_pserver_) {
    pUpdater_->getParametersRemote(true /*full parameter*/,
-                                    true /*after apply*/);
+                                   true /*after apply*/);
  }

  gserver_->saveParameters(saveDir);
@@ -117,9 +115,13 @@ void ParameterUtil::saveParameters(int passId, int passInnerId) {
void ParameterUtil::deleteParameters(int passId, int passInnerId) {
  constexpr int kBufLen = 100;
  char buf[kBufLen];
-  const std::string& saveDir = config_->getSaveDir();
+  const std::string &saveDir = config_->getSaveDir();
  if (passInnerId > 0) {
-    snprintf(buf, kBufLen, "%s/pass-%05d-%03d", saveDir.c_str(), passId,
+    snprintf(buf,
+             kBufLen,
+             "%s/pass-%05d-%03d",
+             saveDir.c_str(),
+             passId,
             passInnerId);
  } else {
    snprintf(buf, kBufLen, "%s/pass-%05d", saveDir.c_str(), passId);
@@ -129,8 +131,7 @@ void ParameterUtil::deleteParameters(int passId, int passInnerId) {
  rmDir(buf);
}

-
-void ParameterUtil::saveConfigWithPath(const std::string& path) {
+void ParameterUtil::saveConfigWithPath(const std::string &path) {
  std::string src;
  // save config in some path
  if (!intConfig_->config_.empty()) {
diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h
index cfb637a3edfdcae866964bb232c64bd731e46179..3923941c3d1533621d89313aa09801e98cd5b8a9 100644
--- a/paddle/trainer/ParamUtil.h
+++ b/paddle/trainer/ParamUtil.h
@@ -12,7 +12,6 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once

#include "paddle/utils/Util.h"
@@ -37,14 +36,14 @@ namespace paddle {
struct ParameterUtilConfig {
  DISABLE_COPY(ParameterUtilConfig);

-  ParameterUtilConfig(bool save_only_one, int saving_period,
+  ParameterUtilConfig(bool save_only_one,
+                      int saving_period,
                      bool load_save_parameters_in_pserver,
-                      std::string config):
-      save_only_one_(save_only_one),
-      saving_period_(saving_period),
-      load_save_param_pserver_(load_save_parameters_in_pserver),
-      config_(config) {
-  }
+                      std::string config)
+      : save_only_one_(save_only_one),
+        saving_period_(saving_period),
+        load_save_param_pserver_(load_save_parameters_in_pserver),
+        config_(config) {}

  bool save_only_one_;
  int saving_period_;
@@ -52,7 +51,6 @@ struct ParameterUtilConfig {
  std::string config_;
};

-
/**
 * ParameterUtil
 * Utility class for loading and saving parameters
@@ -80,8 +78,9 @@ public:
  bool loadParameters(int passId, bool local = true, bool remote = false);

  /// load parameters given path info
-  void loadParametersWithPath(const std::string& dir, bool local = true,
-                              bool remote = false);
+  void loadParametersWithPath(const std::string &dir,
+                              bool local = true,
+                              bool remote = false);

  /// Save parameter to disk for pass passId
  /// passInnerId means saving times in one pass, some users want to
@@ -97,14 +96,14 @@ public:
  void deleteParameters(int passId, int passInnerId = 0);

  /// save config given path info
-  void saveConfigWithPath(const std::string& path);
+  void saveConfigWithPath(const std::string &path);

  /**
   * Try to load parameter from config.
   * @return true if can load from trainer config.
   */
  inline bool tryLoadParametersFromConfig() {
-    auto& c = config_->getConfig();
+    auto &c = config_->getConfig();
    if (!c.init_model_path().empty()) {
      loadParametersWithPath(c.init_model_path());
      return true;
diff --git a/paddle/trainer/ParameterUpdater.cpp b/paddle/trainer/ParameterUpdater.cpp
index ef2b1443d9c35e8d3296730b044c2d4cd3217d89..6001a0b391fb3425315de3194945a4d04aff7150 100644
--- a/paddle/trainer/ParameterUpdater.cpp
+++ b/paddle/trainer/ParameterUpdater.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ParameterUpdater.h"

#include "paddle/utils/Logging.h"
@@ -30,7 +29,8 @@ SgdUpdaterWithCpuAverager::SgdUpdaterWithCpuAverager(
  CHECK(FLAGS_use_gpu && optConfig.do_average_in_cpu());
  averager_.reset(AverageOptimizer::create(optConfig,
                                           new DummyOptimizer(optConfig),
-                                           false /*sparse*/, true /*apply*/));
+                                           false /*sparse*/,
+                                           true /*apply*/));
  updateWorker_.addJob([]() { hl_set_device(FLAGS_gpu_id); });
}
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index 854e6a45d890f6fa2265ac72088c8c2574dfde5a..b83b4cf55e27b25864499531bbfe483fb75f78a1 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
 */
-
#pragma once

#include "paddle/utils/Thread.h"
@@ -69,7 +68,8 @@ public:
    ParameterUpdater::init(parameters);
    optimizer_->init(parameters_.size(), nullptr);
    // check no L1 decay in parameter configs
-    CHECK(std::find_if(parameters.begin(), parameters.end(),
+    CHECK(std::find_if(parameters.begin(),
+                       parameters.end(),
                       [](const ParameterPtr& para) {
                         return para->getConfig().decay_rate_l1() > 0.0f;
                       }) == parameters.end())
@@ -146,7 +146,6 @@ protected:
    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
  }

-
  std::unique_ptr<ParameterOptimizer> optimizer_;

  /**
@@ -163,10 +162,10 @@ class SgdCpuUpdater : public SgdLocalUpdater, public Deprecated {
public:
  explicit SgdCpuUpdater(const OptimizationConfig& optConfig)
      : SgdLocalUpdater(optConfig),
-        Deprecated("SgdCpuUpdater is used only in recursive neural network, "
-                   "and recursive neural network is deprecated in paddle. "
-                   "Use it all by your own.")
-  {}
+        Deprecated(
+            "SgdCpuUpdater is used only in recursive neural network, "
+            "and recursive neural network is deprecated in paddle. "
+            "Use it all by your own.") {}

  /**
   * @brief update all parameter on finish batch.
diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp
index 3a5c2a351737ec9eb98b20c679d21dbfea42eea5..d83bb5b10adeff2dc43ad4705e5c55d10856de0d 100644
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "RemoteParameterUpdater.h"
#include "Trainer.h"
#include "paddle/utils/Stat.h"
@@ -31,7 +30,8 @@ const std::string RemoteParameterUpdater::kAverage = "average";
const std::string RemoteParameterUpdater::kElasticAverage = "elastic_average";

RemoteParameterUpdater::RemoteParameterUpdater(
-    const OptimizationConfig& config, int expectedPassCount,
+    const OptimizationConfig& config,
+    int expectedPassCount,
    std::unique_ptr<ParameterUpdater>&& localUpdater)
    : config_(config),
      localUpdater_(std::move(localUpdater)),
@@ -94,8 +94,8 @@ void RemoteParameterUpdater::init(std::vector<ParameterPtr>& parameters) {
    parameterClient_->getParameter();
    copyParametersToDevice(PARAMETER_VALUE);
  }
-  if (FLAGS_trainer_id == 0 && (config_.algorithm()
-      != TrainAlgorithm::AsyncSGD)) {
+  if (FLAGS_trainer_id == 0 &&
+      (config_.algorithm() != TrainAlgorithm::AsyncSGD)) {
    startController();
    useApplyInPserver_ = useApplyInPserver(config_);
  }
@@ -241,7 +241,9 @@ void RemoteParameterUpdater::finishBatch(real cost) {

  {
    REGISTER_TIMER("sendAndRecv_dense");
-    parameterClient_->sendAndReceiveParameter(mode, sendType, batchSize_,
+    parameterClient_->sendAndReceiveParameter(mode,
+                                              sendType,
+                                              batchSize_,
                                              0,  // cost = 0
                                              sendBackParameter);
  }
@@ -356,7 +358,8 @@ void RemoteParameterUpdater::restore() {
}

ConcurrentRemoteParameterUpdater::ConcurrentRemoteParameterUpdater(
-    OptimizationConfig config, int passCount,
+    OptimizationConfig config,
+    int passCount,
    std::unique_ptr<ParameterUpdater>&& localUpdater)
    : RemoteParameterUpdater(config, passCount, std::move(localUpdater)) {
  sendThread_.reset(new std::thread([this]() { this->send(); }));
@@ -423,7 +426,10 @@ void ConcurrentRemoteParameterUpdater::send(Parameter* para) {
  std::vector<ParameterSegments> paraSegment;
  if (para == NULL) {
    parameterClient_->sendParameter(
-        mode, sendType, paraSegment, batchSize_,
+        mode,
+        sendType,
+        paraSegment,
+        batchSize_,
        0,              // cost=0
        true,           // sendBackParameter = true
        batchStatus_);  // batchStatus_ = BATCH_FINISH
@@ -440,7 +446,10 @@
void ConcurrentRemoteParameterUpdater::send(Parameter* para) {
      copySingleParaFromDevice(para, sendType);
      hl_stream_synchronize(kDeviceToHostStream);
    }
-    parameterClient_->sendParameter(mode, sendType, paraSegment, batchSize_,
+    parameterClient_->sendParameter(mode,
+                                    sendType,
+                                    paraSegment,
+                                    batchSize_,
                                    0,  // cost=0
                                    true,  // sendBackParameter = true
                                    batchStatus_);
@@ -589,14 +598,14 @@ SparseRemoteParameterUpdater::SparseRemoteParameterUpdater(

void SparseRemoteParameterUpdater::init(std::vector<ParameterPtr>& parameters) {
  ParameterUpdater::init(parameters);

-  parameterClient_.reset(new ParameterClient2(false,
-      FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse));
+  parameterClient_.reset(new ParameterClient2(
+      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse));
  parameterClient_->init(parameters_);
  parameterClient_->setTrainerId(FLAGS_trainer_id);

  if (FLAGS_trainer_id == 0) {
-    parameterClient_->setConfig(config_, FLAGS_save_dir,
-                                true /*is_sparse_server*/);
+    parameterClient_->setConfig(
+        config_, FLAGS_save_dir, true /*is_sparse_server*/);
    if (parameters[0]->isFullSize()) {
      parameterClient_->setParameter();
    } else {  // init in pserver
@@ -615,9 +624,8 @@ void SparseRemoteParameterUpdater::startController() {
}

void SparseRemoteParameterUpdater::controller() {
-  ParameterClient2 client(false,
-                          FLAGS_port + FLAGS_ports_num,
-                          FLAGS_ports_num_for_sparse);
+  ParameterClient2 client(
+      false, FLAGS_port + FLAGS_ports_num, FLAGS_ports_num_for_sparse);
  client.init(parameters_);

  while (true) {
@@ -679,7 +687,9 @@ void SparseRemoteParameterUpdater::finishBatch(real cost) {
  ParameterType sendType = PARAMETER_GRADIENT;

  REGISTER_TIMER("sendSparseParam");
-  parameterClient_->sendAndReceiveParameter(mode, sendType, batchSize_,
+  parameterClient_->sendAndReceiveParameter(mode,
+                                            sendType,
+                                            batchSize_,
                                            0,       // cost = 0
                                            false);  // sendBackParameter
@@ -823,6 +833,6 @@ void SparseRemoteParameterUpdaterComposite::init(

std::vector<std::function<ParameterUpdater*(
    const std::string&, const OptimizationConfig&, bool, size_t)>>
-ParameterUpdaterCreators::constructors_;
+    ParameterUpdaterCreators::constructors_;

}  // namespace paddle
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index be273e9ef73c744ddbcad760ac50a5720c7502a9..a40884724cc7f963dc6ce5eede750327b2bbfed9 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
 */
-
#pragma once

#include
@@ -56,7 +55,8 @@ namespace paddle {
class RemoteParameterUpdater : public ParameterUpdater {
public:
  RemoteParameterUpdater(
-      const OptimizationConfig& config, int expectedPassCount,
+      const OptimizationConfig& config,
+      int expectedPassCount,
      std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
  ~RemoteParameterUpdater() {
    if (controllerThread_) {
@@ -180,7 +180,8 @@ protected:
class ConcurrentRemoteParameterUpdater : public RemoteParameterUpdater {
public:
  ConcurrentRemoteParameterUpdater(
-      OptimizationConfig config, int expectedPassCount,
+      OptimizationConfig config,
+      int expectedPassCount,
      std::unique_ptr<ParameterUpdater>&& localUpdater);
  ~ConcurrentRemoteParameterUpdater();
@@ -264,7 +265,8 @@ private:
class SparseRemoteParameterUpdater : public ParameterUpdater {
public:
  SparseRemoteParameterUpdater(const OptimizationConfig& config,
-                               int expectedPassCount, bool testing);
+                               int expectedPassCount,
+                               bool testing);
  ~SparseRemoteParameterUpdater() {
    if (controllerThread_) {
      controllerThread_->join();
@@ -345,7 +347,9 @@ public:
   * @note use syncThreadPool to synchronize these two updaters
   */
  SparseRemoteParameterUpdaterComposite(
-      const OptimizationConfig& config, int expectedPassCount, bool testing,
+      const OptimizationConfig& config,
+      int expectedPassCount,
+      bool testing,
      std::unique_ptr<ParameterUpdater>&& normalUpdater) {
    updaters_.resize(NUMBER_UPDATERS);
    updaters_[UPDATER_SPARSE_REMOTE].reset(
@@ -373,11 +377,11 @@ public:
   */
  static void addCreator(
      const std::function<ParameterUpdater*(
          const std::string&,         // algo
          const OptimizationConfig&,  // optConfig
-          bool,  // isLocal
-          size_t  // numPasses
-          )>& creator) {  // NOLINT explicit move closing ) in this line
+          bool,    // isLocal
+          size_t   // numPasses
+          )>& creator) {  // NOLINT explicit move closing ) in this line
                          // for readability
    constructors_.push_back(creator);
  }
@@ -395,7 +399,7 @@ public:
      const OptimizationConfig& optConfig,
      bool isLocal,
      size_t numPasses) {
-    for (auto & c : constructors_) {
+    for (auto& c : constructors_) {
      if (auto updater = c(algo, optConfig, isLocal, numPasses)) {
        return updater;
      }
@@ -406,7 +410,7 @@ public:
private:
  static std::vector<std::function<ParameterUpdater*(
      const std::string&, const OptimizationConfig&, bool, size_t)>>
-          constructors_;
+      constructors_;
};

}  // namespace paddle
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index d3b88019faa04b7cebf44dd63678aa9d4ffb5252..30e92682baec2fc6035ecfa9dbd90415acd5abe1 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "Tester.h"

#include
@@ -37,38 +36,33 @@ limitations under the License.
 */

namespace paddle {

-Tester::Tester(const std::shared_ptr<TrainerConfigHelper> &config,
-               std::unique_ptr<TesterConfig> &&intconfig,
-               const GradientMachinePtr &gradientMachine,
-               const std::shared_ptr<ParameterUpdater> &parameterUpdater,
-               std::shared_ptr<DataProvider> testDataProvider):
-    config_(config),
-    intconfig_(std::move(intconfig)),
-    gradientMachine_(gradientMachine),
-    parameterUpdater_(parameterUpdater),
-    testDataProvider_(testDataProvider) {
-  testEvaluator_.reset(gradientMachine_ ->makeEvaluator());
+Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
+               std::unique_ptr<TesterConfig>&& intconfig,
+               const GradientMachinePtr& gradientMachine,
+               const std::shared_ptr<ParameterUpdater>& parameterUpdater,
+               std::shared_ptr<DataProvider> testDataProvider)
+    : config_(config),
+      intconfig_(std::move(intconfig)),
+      gradientMachine_(gradientMachine),
+      parameterUpdater_(parameterUpdater),
+      testDataProvider_(testDataProvider) {
+  testEvaluator_.reset(gradientMachine_->makeEvaluator());
  if (intconfig_->distributeTest) {
    testParameterClient_.reset(new ParameterClient2(true));
  }

  if (testParameterClient_) {
-    testParameterClient_->init(
-        gradientMachine_->getParameters());
+    testParameterClient_->init(gradientMachine_->getParameters());
  }

  std::unique_ptr<ParameterUtilConfig> paramConfig(
-      new ParameterUtilConfig(
-          intconfig_->saveOnlyOne,
-          intconfig_->savingPeriod,
-          intconfig_->loadsaveParametersInPserver,
-          intconfig_->config));
+      new ParameterUtilConfig(intconfig_->saveOnlyOne,
+                              intconfig_->savingPeriod,
+                              intconfig_->loadsaveParametersInPserver,
+                              intconfig_->config));

  paramUtil_.reset(new ParameterUtil(
-      config_,
-      std::move(paramConfig),
-      gradientMachine_,
-      parameterUpdater_));
+      config_, std::move(paramConfig), gradientMachine_, parameterUpdater_));
}

void Tester::startTestPeriod() {
@@ -83,10 +77,10 @@ void Tester::startTestPeriod() {
  }
}

-void Tester::testOneDataBatch(
-    const DataBatch& dataBatch, std::vector<Argument>* outArgs) {
-  testContext_.cost += forwardOneBatch(
-      dataBatch, testEvaluator_.get(), outArgs);
+void Tester::testOneDataBatch(const DataBatch& dataBatch,
+                              std::vector<Argument>* outArgs) {
+  testContext_.cost +=
+      forwardOneBatch(dataBatch, testEvaluator_.get(), outArgs);
  testContext_.numSamples += dataBatch.getSize();
}

@@ -158,8 +152,8 @@ int64_t Tester::testOneBatchById(int64_t batchId) {
  return actualBatchSize;
}

-real Tester::forwardOneBatch(const DataBatch &dataBatch,
-                             Evaluator *evaluator,
+real Tester::forwardOneBatch(const DataBatch& dataBatch,
+                             Evaluator* evaluator,
                             std::vector<Argument>* pOutArgs) {
  auto& outArgs = *pOutArgs;
  const std::vector<Argument>& inArgs = dataBatch.getStreams();
@@ -180,7 +174,8 @@ real Tester::forwardOneBatch(const DataBatch &dataBatch,
    featMatrices.resize(numOutputs);
    for (size_t i = 0; i < numOutputs; ++i) {
      featMatrices[i] = Matrix::create(outArgs[i].value->getHeight(),
-                                       outArgs[i].value->getWidth(), false,
+                                       outArgs[i].value->getWidth(),
+                                       false,
                                       false);  // CPU data buffer
      featMatrices[i]->copyFrom(*(outArgs[i].value), HPPL_STREAM_DEFAULT);
    }
@@ -222,20 +217,19 @@ real Tester::forwardOneBatch(const DataBatch &dataBatch,
  return Argument::sumCosts(outArgs);
}

-
void Tester::testOnePassBatch(int passId) {
  stats_.reset();
  const std::vector<Argument> inArgs;
  gradientMachine_->forward(inArgs, nullptr, PASS_TEST);
-  int64_t num; real cost;
+  int64_t num;
+  real cost;
  gradientMachine_->getStats(cost, num);
-  stats_ += std::pair<int64_t, real> {num, cost};
+  stats_ += std::pair<int64_t, real>{num, cost};
  gradientMachine_->onPassEnd();

  LOG(INFO) << " Pass=" << passId << " " << stats_.getStats(false);
}

-
void Tester::testOnePass(int passId) {
  stats_.reset();
  int64_t batchId = 0;
@@ -265,7 +259,6 @@
 }
  }
}

-
void Tester::test() {
  CHECK(testDataProvider_) << "TestData is not specified";
  testDataProvider_->setSkipShuffle();
@@ -281,33 +274,32 @@ void Tester::test() {
    intconfig_->testPass = 0;
    intconfig_->numPasses = modelList.size();
    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) <<
-        "--test_wait must be 0 for evaluation";
+    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
  } else if (!initModelPath.empty()) {
    modelList.push_back(initModelPath);
    intconfig_->testPass = 0;
    intconfig_->numPasses = 1;
    intconfig_->savingPeriod = 1;
-    CHECK_EQ(intconfig_->testWait, 0) <<
-        "--test_wait must be 0 for evaluation";
+    CHECK_EQ(intconfig_->testWait, 0) << "--test_wait must be 0 for evaluation";
  }

  for (int i = intconfig_->testPass; i < intconfig_->numPasses; ++i) {
    int passId = i;
    if (passId % intconfig_->savingPeriod == 0) {
      if (intconfig_->testWait) {
-        while (paramUtil_->loadParameters(passId,
-            true /*local*/, true /*remote*/) == false) {
+        while (paramUtil_->loadParameters(
+                   passId, true /*local*/, true /*remote*/) == false) {
          LOG(INFO) << "Waiting for parameters of pass " << passId;
          sleep(60);  // sleep 60s
        }
      } else {
        if (modelList.size() == 0) {
-          CHECK_EQ(paramUtil_->loadParameters(passId,
-              true /*local*/, true /*remote*/), true);
+          CHECK_EQ(paramUtil_->loadParameters(
+                       passId, true /*local*/, true /*remote*/),
+                   true);
        } else {
-          paramUtil_->loadParametersWithPath(modelList[i],
-              true /*local*/, true /*remote*/);
+          paramUtil_->loadParametersWithPath(
+              modelList[i], true /*local*/, true /*remote*/);
        }
      }
      if (IGradientMachineMode::trainWholeDataInOneBatch(intconfig_->mode)) {
@@ -326,9 +318,8 @@ void Tester::test() {
  gradientMachine_->finish();
}

-
void Tester::printOutput(const std::vector<Argument>& outArgs,
-                  std::ostream& os) {
+                         std::ostream& os) {
  size_t numOutputs = outArgs.size();
  size_t numIns = outArgs[0].getBatchSize();
  if (cpuMat_.size() != numOutputs || cpuVec_.size() != numOutputs) {
@@ -346,11 +337,13 @@ void Tester::printOutput(const std::vector<Argument>& outArgs,
    } else if (dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get())) {
      auto sparseMat =
          dynamic_cast<GpuSparseMatrix*>(outArgs[i].value.get());
-      cpuMat_[i] = Matrix::createSparseMatrix(
-          sparseMat->getHeight(), sparseMat->getWidth(),
-          sparseMat->getElementCnt(), sparseMat->getValueType(),
-          sparseMat->format_, false, /* trans */
-          false); /* useGpu */
+      cpuMat_[i] = Matrix::createSparseMatrix(sparseMat->getHeight(),
+                                              sparseMat->getWidth(),
+                                              sparseMat->getElementCnt(),
+                                              sparseMat->getValueType(),
+                                              sparseMat->format_,
+                                              false, /* trans */
+                                              false); /* useGpu */
      hl_stream_t stream = HPPL_STREAM_DEFAULT;
      cpuMat_[i]->copyFrom(*sparseMat, stream);
    } else {
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
index 671ffc5220ebaf2e009225191f6a77e6fea80d33..a9de9fe208c61c00fbeebe644222e255308e762b 100644
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once

#include "paddle/utils/Util.h"
@@ -49,10 +48,10 @@ public:
   * for getting parameter from parameter-server.
   * @param testDataProvider Test data provider.
   */
-  Tester(const std::shared_ptr<TrainerConfigHelper> &config,
-         std::unique_ptr<TesterConfig> &&intconfig,
-         const GradientMachinePtr &gradientMachine,
-         const std::shared_ptr<ParameterUpdater> &parameterUpdater,
+  Tester(const std::shared_ptr<TrainerConfigHelper>& config,
+         std::unique_ptr<TesterConfig>&& intconfig,
+         const GradientMachinePtr& gradientMachine,
+         const std::shared_ptr<ParameterUpdater>& parameterUpdater,
         std::shared_ptr<DataProvider> testDataProvider);

  /**
@@ -83,13 +82,11 @@ public:
                        Evaluator* evaluator,
                        std::vector<Argument>* outArgs);

-
  /**
   * perform the full pass of test given test data provider
   */
  void test();

-
protected:
  std::shared_ptr<ParameterClient2> testParameterClient_;
  std::shared_ptr<TrainerConfigHelper> config_;
diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h
index d5e644ce6124710c76a463d521c16451e22b5462..90267e68d768f2a144e0041d0f493072ef9eb9a1 100644
--- a/paddle/trainer/TesterConfig.h
+++ b/paddle/trainer/TesterConfig.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once

#include "paddle/utils/Util.h"
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index d0fda1b6253e3e4b11a7e6b956d9a93ad5596728..cc22851d8ecbf594df1e3f2c8aeaa98c07b3765b 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "ThreadParameterUpdater.h"

#include "paddle/utils/Logging.h"
@@ -45,7 +44,8 @@ void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) {
  optimizers_.resize(maxId + 1);
  for (auto& para : parameters_) {
    int pid = para->getID();
-    optimizers_[pid].reset(sgdOptimizerCreate(config_, para->getConfig(),
+    optimizers_[pid].reset(sgdOptimizerCreate(config_,
+                                              para->getConfig(),
                                              para->isGradSparseUpdate(),
                                              false /*inPserver*/));
    size_t numRows = para->isGradSparseUpdate() ?
        para->getConfig().dims(0) : 0;
@@ -91,8 +91,10 @@ void SgdThreadUpdater::updateImpl(Parameter* para) {
}

void SgdThreadUpdater::threadTraverse(
-    const ParameterOptimizer::TraverseCallback& callback, int tid,
-    size_t numThreads, Parameter* para) {
+    const ParameterOptimizer::TraverseCallback& callback,
+    int tid,
+    size_t numThreads,
+    Parameter* para) {
  VectorPtr* vecs = Parameter::getTlsTempBufs();
  if (para->isGradSparseUpdate()) {
    size_t height = para->getConfig().dims(0);
@@ -106,8 +108,8 @@ void SgdThreadUpdater::threadTraverse(
    }
  } else {  // dense
    // setup sub bufs
-    auto interval = calcSplitArrayInterval(para->getSize(), (size_t)tid,
-                                           numThreads, 8LU /*for avx*/);
+    auto interval = calcSplitArrayInterval(
+        para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);
    for (auto type : parameterTypes_) {
      vecs[type]->subVecFrom(*para->getBuf(type), interval);
    }
@@ -150,7 +152,7 @@ void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) {
  } else if (hasCpuPara) {
    getGlobalSyncThreadPool()->exec(cpuTraverse);
  } else if (hasGpuPara) {
-    gpuTraverse(0, 0);
+    gpuTraverse(0, 0);
  }
}

@@ -168,9 +170,8 @@ void SgdThreadUpdater::catchUpWith() {

void SgdThreadUpdater::apply() {
  catchUpWith();

-  traverse([this](Parameter* para) {
-    return optimizers_[para->getID()]->apply();
-  });
+  traverse(
+      [this](Parameter* para) { return optimizers_[para->getID()]->apply(); });
}

void SgdThreadUpdater::restore() {
@@ -205,9 +206,9 @@ void SgdThreadUpdater::finishBatch(real cost) {
  }
}

-void SgdThreadUpdater::threadUpdateSparse(
-    int tid, size_t numThreads, Parameter* para) {
-
+void SgdThreadUpdater::threadUpdateSparse(int tid,
+                                          size_t numThreads,
+                                          Parameter* para) {
  int pid = para->getID();
  ParameterOptimizer* optimizer = optimizers_[pid].get();
  VectorPtr* vecs = Parameter::getTlsTempBufs();
@@ -216,10 +217,10 @@ void SgdThreadUpdater::threadUpdateSparse(
  size_t width = para->getConfig().dims(1);

  if (dynamic_cast<SparseRowIdsCpuMatrix*>(
-          para->getMat(PARAMETER_GRADIENT).get())) {
+          para->getMat(PARAMETER_GRADIENT).get())) {
    // From MultiGradientMachine
    SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
+        para->getMat(PARAMETER_GRADIENT).get());
    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);

    for (auto id : sparseIds) {
@@ -232,16 +233,16 @@ void SgdThreadUpdater::threadUpdateSparse(
    }
    sparseIds.clear();
  } else if (dynamic_cast<SparseRowCpuMatrix*>(
-                 para->getMat(PARAMETER_GRADIENT).get())) {
+                 para->getMat(PARAMETER_GRADIENT).get())) {
    // From NeuralNetwork
    SparseRowCpuMatrix* mainMat = dynamic_cast<SparseRowCpuMatrix*>(
-        para->getMat(PARAMETER_GRADIENT).get());
+        para->getMat(PARAMETER_GRADIENT).get());
    std::vector<unsigned int>& localIndices =
        mainMat->getIndexDictHandle()->localIndices;

-    auto interval = calcSplitArrayInterval(
-        localIndices.size(), tid, numThreads);
+    auto interval =
+        calcSplitArrayInterval(localIndices.size(), tid, numThreads);
    for (size_t i = interval.first; i < interval.second; ++i) {
      auto id = localIndices[i];
      real* row = mainMat->getLocalRow(i);
@@ -261,12 +262,11 @@ void SgdThreadUpdater::threadUpdateSparse(
    CHECK_EQ(numThreads, 1UL);
    mainMat->clearIndices();
  } else {
-    auto & m = *para->getMat(PARAMETER_GRADIENT).get();
+    auto& m = *para->getMat(PARAMETER_GRADIENT).get();
    LOG(FATAL) << "Internal error: " << para->getName() << " "
               << typeid(m).name();
  }

-
  if (auto callback = optimizer->needSpecialTraversal(para->getConfig())) {
    for (size_t i = tid; i < height; i += numThreads) {
      // setup sub bufs
@@ -278,14 +278,15 @@ void SgdThreadUpdater::threadUpdateSparse(
  }
}
-void SgdThreadUpdater::threadUpdateDense(int tid, size_t numThreads,
+void SgdThreadUpdater::threadUpdateDense(int tid,
+                                         size_t numThreads,
                                         Parameter* para) {
  int pid = para->getID();
  ParameterOptimizer* optimizer = optimizers_[pid].get();
  VectorPtr* vecs = Parameter::getTlsTempBufs();

-  auto interval = calcSplitArrayInterval(para->getSize(), (size_t)tid,
-                                         numThreads, 8LU /*for avx*/);
+  auto interval = calcSplitArrayInterval(
+      para->getSize(), (size_t)tid, numThreads, 8LU /*for avx*/);

  // setup sub bufs
  for (auto type : parameterTypes_) {
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index d8a7a5dd4f12afc7edfbf2c5f28cbe31d7516153..5a5e3f1d4b3c1e915aa6ac01ff503c552e42de1a 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once

#include "paddle/utils/Util.h"
@@ -26,7 +25,6 @@ limitations under the License. */
#include
#include

-
namespace paddle {

/**
@@ -45,14 +43,12 @@ public:
  explicit SgdThreadUpdater(const OptimizationConfig& optConfig);
  virtual ~SgdThreadUpdater() {}

-
  // Use the startPass() function of the base optimizer.
  virtual void startPass();

  // Use the finishPass() function of the base optimizer.
  virtual bool finishPass(real cost);

-
  virtual void init(std::vector<ParameterPtr>& parameters);
  virtual PassType startBatch(int64_t batchSize);
  // Call finishBatch for each optimizer.
@@ -78,9 +74,11 @@ protected:
  void threadUpdateDense(int tid, size_t numThreads, Parameter* para);
  // The update function for after update operations, such as averager.
  void threadTraverse(const ParameterOptimizer::TraverseCallback& callback,
-                      int tid, size_t numThreads, Parameter* para);
+                      int tid,
+                      size_t numThreads,
+                      Parameter* para);
  typedef std::function<const ParameterOptimizer::TraverseCallback(Parameter*)>
-          GetTraverseCallback;
+      GetTraverseCallback;
  void traverse(GetTraverseCallback getTraverseCallback);
};
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 7fc48dd1fbec6588b71db031d89dd88c5c5cf92c..8a5162912e5feae9b80ab8fff56bb20e4dac1696 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "Trainer.h"

#include
@@ -40,7 +39,8 @@ limitations under the License. */
#include "TrainerConfigHelper.h"

P_DEFINE_string(config, "", "Trainer config file");

-P_DEFINE_int32(test_period, 0,
+P_DEFINE_int32(test_period,
+               0,
               "Run test every so many train batches."
               " 0 for testing after each pass."
               " If not 0, test log_period batches."
@@ -49,23 +49,28 @@ P_DEFINE_int32(test_period, 0,
P_DEFINE_bool(local, true, "Train in local mode or not");

P_DEFINE_bool(
-    test_all_data_in_one_period, false,
+    test_all_data_in_one_period,
+    false,
    "true will test all data in one test period."
    "Otherwise test (batch_size * log_period) data in one test period.");

-P_DEFINE_int32(average_test_period, 0,
+P_DEFINE_int32(average_test_period,
+               0,
               "Do test on average parameter every so"
               " many batches. MUST be divided by FLAGS_log_period."
" Default 0 means do not test average parameter"); P_DEFINE_int32(saving_period, 1, "Save parameteres every so many passes"); -P_DEFINE_int64(saving_period_by_batches, 0, +P_DEFINE_int64(saving_period_by_batches, + 0, "Save parameters every so many batches in one pass"); P_DEFINE_string(save_dir, "", "Directory for saving model parameter"); -P_DEFINE_int32(start_pass, 0, +P_DEFINE_int32(start_pass, + 0, "Start training from this pass. " "Will load parameter from the previous pass"); -P_DEFINE_int32(test_pass, -1, +P_DEFINE_int32(test_pass, + -1, "Will load parameter start from this pass to test"); P_DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist"); P_DEFINE_bool(with_cost, true, "enable cost layer or not"); @@ -73,17 +78,21 @@ P_DEFINE_bool(distribute_test, false, "test in distribute mode"); P_DEFINE_int32(num_passes, 100, "train for so many passes"); -P_DEFINE_string(config_args, "", +P_DEFINE_string(config_args, + "", "arguments passed to config file." "Format: key1=value1,key2=value2"); -P_DEFINE_bool(save_only_one, false, +P_DEFINE_bool(save_only_one, + false, "Save only parameters in last pass, remove previous."); P_DEFINE_string(feat_file, "", "File name of extracted feature."); -P_DEFINE_string(predict_output_dir, "", +P_DEFINE_string(predict_output_dir, + "", "Directory that saves the predicted results of output layers"); -P_DEFINE_string(model_list, "", +P_DEFINE_string(model_list, + "", "File that saves the model list when evaluation"); namespace paddle { @@ -98,11 +107,11 @@ void Trainer::init(int argc, char** argv) { init(config); } -void Trainer::init(const std::shared_ptr &config, +void Trainer::init(const std::shared_ptr& config, bool testing, - const std::shared_ptr &gradientMachine, - const std::shared_ptr &dataProvider, - const std::shared_ptr &testDataProvider) { + const std::shared_ptr& gradientMachine, + const std::shared_ptr& dataProvider, + const std::shared_ptr& testDataProvider) { this->stats_ = std::make_shared(); config_ = config; @@ -156,13 +165,16 @@ void Trainer::init(const std::shared_ptr &config, LOG(INFO) << "trainer mode: Testing"; } } else if (IGradientMachineMode::tryGetMode( - (int*)&mode_, config_->getOptConfig().algorithm(), - FLAGS_trainer_count, - FLAGS_local, FLAGS_use_gpu)) { + (int*)&mode_, + config_->getOptConfig().algorithm(), + FLAGS_trainer_count, + FLAGS_local, + FLAGS_use_gpu)) { LOG(INFO) << "Custom trainer mode."; } else if ((config_->getOptConfig().algorithm() == TrainAlgorithm::SGD || - config_->getOptConfig().algorithm() == TrainAlgorithm::AsyncSGD) - && useSparseUpdater) { + config_->getOptConfig().algorithm() == + TrainAlgorithm::AsyncSGD) && + useSparseUpdater) { mode_ = GradientMachine::kSgdSparseCpuTraining; LOG(INFO) << "trainer mode: SgdSparseCpuTraining"; } else { @@ -171,26 +183,26 @@ void Trainer::init(const std::shared_ptr &config, } // initialize trainer internal - trainerInternal_.init(config_, gradientMachine, + trainerInternal_.init(config_, + gradientMachine, TrainerInternalConfig::createFromMode(mode_), - stats_, testing); + stats_, + testing); std::unique_ptr paramConfig( - new ParameterUtilConfig(FLAGS_save_only_one, - FLAGS_saving_period, - FLAGS_loadsave_parameters_in_pserver, - FLAGS_config)); + new ParameterUtilConfig(FLAGS_save_only_one, + FLAGS_saving_period, + FLAGS_loadsave_parameters_in_pserver, + FLAGS_config)); paramUtil_.reset( - new paddle::ParameterUtil( - config_, - std::move(paramConfig), - trainerInternal_.getGradientMachine(), - trainerInternal_.getParameterUpdater())); - + new 
+      new paddle::ParameterUtil(config_,
+                                std::move(paramConfig),
+                                trainerInternal_.getGradientMachine(),
+                                trainerInternal_.getParameterUpdater()));

-
-  bool gpuData = FLAGS_use_gpu && (!FLAGS_parallel_nn) &&
-                 (!IGradientMachineMode::dataMustInCpu(mode_,
-                                                       FLAGS_trainer_count));
+  bool gpuData =
+      FLAGS_use_gpu && (!FLAGS_parallel_nn) &&
+      (!IGradientMachineMode::dataMustInCpu(mode_, FLAGS_trainer_count));

  dataProvider_ = dataProvider;
  if (!dataProvider_ && config_->hasDataConfig()) {
@@ -244,12 +256,14 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
  } else if (!config_->getConfig().init_model_path().empty() &&
             (FLAGS_local || FLAGS_trainer_id == 0)) {
    paramUtil_->loadParametersWithPath(
-        config_->getConfig().init_model_path(),
-        false /*local*/, true /*remote*/);
+        config_->getConfig().init_model_path(),
+        false /*local*/,
+        true /*remote*/);
  } else if (config_->getConfig().start_pass() > 0 &&
             (FLAGS_local || FLAGS_trainer_id == 0)) {
    CHECK(paramUtil_->loadParameters(config_->getConfig().start_pass() - 1,
-                                     false /*local*/, true /*remote*/));
+                                     false /*local*/,
+                                     true /*remote*/));
  } else {
    trainerInternal_.getParameterUpdater()->randParametersRemote();
  }
@@ -277,9 +291,8 @@ void Trainer::train(size_t numPasses) {
  finishTrain();
}

-
static double genPerturbation(real* d, real* grad, size_t dim) {
-  auto & reng = ThreadLocalRandomEngine::get();
+  auto& reng = ThreadLocalRandomEngine::get();
  std::uniform_real_distribution<double> dist(-1, 1);
  double gradNorm = 0, dNorm = 0;
  for (size_t i = 0; i < dim; ++i) {
@@ -390,9 +403,7 @@ void Trainer::startTrain() {
  trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
}

-void Trainer::finishTrain() {
-  trainerInternal_.getGradientMachine()->finish();
-}
+void Trainer::finishTrain() { trainerInternal_.getGradientMachine()->finish(); }

void Trainer::startTrainPass() {
  stats_->reset();
@@ -421,9 +432,8 @@ void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
      if (FLAGS_prev_batch_state) {
        trainerInternal_.getGradientMachine()->getState(trainState_);
      }
-      trainPassContext_.avgTestCost +=
-          tester_->forwardOneBatch(
-              dataBatch, averageEvaluator_.get(), &forwardOutput_);
+      trainPassContext_.avgTestCost += tester_->forwardOneBatch(
+          dataBatch, averageEvaluator_.get(), &forwardOutput_);
      if (FLAGS_prev_batch_state) {
        trainerInternal_.getGradientMachine()->setState(trainState_);
      }
@@ -434,16 +444,16 @@ void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
  {
    REGISTER_TIMER("TrainBatch");
    trainerInternal_.trainOneBatch(
-        trainPassContext_.batchId, dataBatch, &forwardOutput_);
+        trainPassContext_.batchId, dataBatch, &forwardOutput_);
  }

  if (averageEvaluator_ &&
-      trainPassContext_.batchId % FLAGS_average_test_period
-      == FLAGS_average_test_period - 1) {
+      trainPassContext_.batchId % FLAGS_average_test_period ==
+          FLAGS_average_test_period - 1) {
    averageEvaluator_->finish();
    LOG(INFO) << " Averaged parameter:"
-              << " cost=" << trainPassContext_.avgTestCost
-              / trainPassContext_.numAvgTests
+              << " cost="
+              << trainPassContext_.avgTestCost / trainPassContext_.numAvgTests
              << " Eval: " << *averageEvaluator_;
    trainPassContext_.numAvgTests = 0;
    trainPassContext_.avgTestCost = 0;
@@ -463,15 +473,15 @@ void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
  }

  if (FLAGS_saving_period_by_batches > 0 &&
-      trainPassContext_.batchId
-      > FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
+      trainPassContext_.batchId >
+          FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
      0 == FLAGS_trainer_id) {
    trainerInternal_.getParameterUpdater()->catchUpWith();
    if (testDataProvider_) {
      tester_->testOnePeriod();
    }
-    paramUtil_->saveParametersOnePass(
-        trainPassContext_.passId, trainPassContext_.passInnerId);
+    paramUtil_->saveParametersOnePass(trainPassContext_.passId,
+                                      trainPassContext_.passInnerId);
    ++trainPassContext_.passInnerId;
  }
}
@@ -482,8 +492,8 @@ void Trainer::finishTrainPass() {
    return;
  }

-  trainerInternal_.finishTrainPass(
-      trainPassContext_.passId, trainPassContext_.batchId);
+  trainerInternal_.finishTrainPass(trainPassContext_.passId,
+                                   trainPassContext_.batchId);

  FOR_TIMING(globalStat.setThreadInfo(true));
  FOR_TIMING(globalStat.printAllStatus());
@@ -493,8 +503,8 @@ void Trainer::finishTrainPass() {
    tester_->testOnePeriod();
  }

-  if (trainPassContext_.passId % FLAGS_saving_period == 0
-      && FLAGS_trainer_id == 0) {
+  if (trainPassContext_.passId % FLAGS_saving_period == 0 &&
+      FLAGS_trainer_id == 0) {
    paramUtil_->saveParametersOnePass(trainPassContext_.passId);
  }
  ++trainPassContext_.passId;
@@ -526,8 +536,8 @@ void Trainer::trainOnePassBatch(int passId) {
  const std::vector<Argument> inArgs;
  {
    REGISTER_TIMER("onePass");
-    trainerInternal_.getGradientMachine()->forwardBackward(inArgs, nullptr,
-                                                           PASS_TRAIN, nullptr);
+    trainerInternal_.getGradientMachine()->forwardBackward(
+        inArgs, nullptr, PASS_TRAIN, nullptr);
  }

  real cost = .0;
@@ -537,8 +547,7 @@ void Trainer::trainOnePassBatch(int passId) {

  trainerInternal_.getGradientMachine()->onPassEnd();

-  bool accepted =
-      trainerInternal_.getParameterUpdater()->finishPass(cost);
+  bool accepted = trainerInternal_.getParameterUpdater()->finishPass(cost);

  globalStat.setThreadInfo(true);
  globalStat.printAllStatus();
@@ -559,11 +568,12 @@ void Trainer::trainOnePassBatch(int passId) {
  }
}

-real Trainer::calcGradient(const DataBatch& dataBatch, const Vector& value,
+real Trainer::calcGradient(const DataBatch& dataBatch,
+                           const Vector& value,
                           Vector& gradient) {
  CHECK_EQ(value.getSize(), gradient.getSize());
  std::vector<ParameterPtr>& parameters =
-      trainerInternal_.getGradientMachine()->getParameters();
+      trainerInternal_.getGradientMachine()->getParameters();

  clearGradient();

@@ -584,8 +594,8 @@ real Trainer::calcGradient(const DataBatch& dataBatch, const Vector& value,
  std::vector<Argument> inArgs = dataBatch.getStreams();
  std::vector<Argument> outArgs;

-  trainerInternal_.getGradientMachine()->forwardBackward(inArgs, &outArgs,
-                                                         PASS_TRAIN);
+  trainerInternal_.getGradientMachine()->forwardBackward(
+      inArgs, &outArgs, PASS_TRAIN);
  real cost = Argument::sumCosts(outArgs);

  offset = 0;
@@ -612,15 +622,14 @@ void Trainer::clearGradient() {
int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); }

void Trainer::createTester() {
-  tester_.reset(new paddle::Tester(config_, createTesterConfig(),
+  tester_.reset(new paddle::Tester(config_,
+                                   createTesterConfig(),
                                   trainerInternal_.getGradientMachine(),
                                   trainerInternal_.getParameterUpdater(),
                                   testDataProvider_));
}

-void Trainer::test() {
-  tester_->test();
-}
+void Trainer::test() { tester_->test(); }

std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
  TesterConfig* conf = new TesterConfig;
@@ -648,7 +657,5 @@ std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
  return std::unique_ptr<TesterConfig>(conf);
}

-ParameterUtil* Trainer::getParameterUtilPtr() {
-  return paramUtil_.get();
-}
+ParameterUtil* Trainer::getParameterUtilPtr() { return paramUtil_.get(); }

}  // namespace paddle
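A hedged sketch of how the Trainer API reformatted above is typically driven; the explicit pass count is illustrative, and initMain is assumed from Paddle's other entry points rather than shown in this patch.

// Sketch only: a minimal training entry point built on the calls above.
int main(int argc, char** argv) {
  paddle::initMain(argc, argv);    // assumed global init, as in paddle tools
  paddle::Trainer trainer;
  trainer.init(argc, argv);        // parses --config and the flags defined above
  trainer.train(/*numPasses=*/2);  // drives startTrainPass()/finishTrainPass()
  return 0;
}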
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index 7762722456c442cff956c3a551c66acb2bdebc62..899607c7c0f17ef2e91969f5ba1dcfa573518727 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once

#include "paddle/utils/Util.h"
@@ -66,18 +65,17 @@ public:
   * @param testDataProvider Test Data Provider. null if create from config.
   */
  virtual void init(
-      const std::shared_ptr<TrainerConfigHelper> &config,
+      const std::shared_ptr<TrainerConfigHelper>& config,
      bool testing = false,
-      const std::shared_ptr<GradientMachine> &gradientMachine = nullptr,
-      const std::shared_ptr<DataProvider> &dataProvider = nullptr,
-      const std::shared_ptr<DataProvider> &testDataProvider = nullptr);
+      const std::shared_ptr<GradientMachine>& gradientMachine = nullptr,
+      const std::shared_ptr<DataProvider>& dataProvider = nullptr,
+      const std::shared_ptr<DataProvider>& testDataProvider = nullptr);

  /**
   * Initialize Trainer from command line flags.
   */
  void init(int argc, char** argv);

-
  /**
   * Train until num_passes reached.
   * One pass means neural network train through all training data.
@@ -108,7 +106,8 @@ public:
   * TODO(yuyang18): I think this method is deprecated and buggy. Should it be
   * removed?
   */
-  real calcGradient(const DataBatch& dataBatch, const Vector& value,
+  real calcGradient(const DataBatch& dataBatch,
+                    const Vector& value,
                    Vector& gradient);

  /**
@@ -207,12 +206,12 @@ protected:
  // parameter util
  std::unique_ptr<ParameterUtil> paramUtil_;

-  #ifdef PADDLE_METRIC_LEARNING
+#ifdef PADDLE_METRIC_LEARNING
  MetricTrainer trainerInternal_;
-  #else
+#else
  // trainer Internal
  TrainerInternal trainerInternal_;
-  #endif
+#endif
};

}  // namespace paddle
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index 98197e7988517ad9ae3cf244e98654368a6ec17a..ee5b1e0a9c5a8faa6614d76ab938f1f1b8f4e73a 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -29,9 +29,8 @@ P_DECLARE_bool(with_gpu);
P_DECLARE_bool(parallel_nn);
P_DECLARE_string(config_args);

-
-const char* kConfigParserModuleName = "paddle.trainer.config_parser";
-const char* kConfigParserFuncName = "parse_config_and_serialize";
+const char *kConfigParserModuleName = "paddle.trainer.config_parser";
+const char *kConfigParserFuncName = "parse_config_and_serialize";

namespace paddle {

@@ -40,12 +39,10 @@ struct TrainerConfigHelperPrivate {
};

TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
-    :m(new TrainerConfigHelperPrivate()) {
+    : m(new TrainerConfigHelperPrivate()) {
  std::ostringstream configArgs;
-  configArgs << "trainer_id=" << FLAGS_trainer_id
-             << ",local=" << FLAGS_local
-             << ",with_cost=" << FLAGS_with_cost
-             << ",use_gpu=" << FLAGS_use_gpu
+  configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
+             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
             << ",parallel_nn=" << FLAGS_parallel_nn
             << ",cudnn_version=" << hl_get_cudnn_lib_version();
  if (!FLAGS_config_args.empty()) {
@@ -54,31 +51,26 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)

  VLOG(3) << "Parsing trainer config " << configFilePath;
  std::string configProtoStr =
-      callPythonFunc(kConfigParserModuleName, kConfigParserFuncName,
+      callPythonFunc(kConfigParserModuleName,
+                     kConfigParserFuncName,
                     {configFilePath, configArgs.str()});
  CHECK(m->conf.ParseFromString(configProtoStr));
}

-TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig& config)
-    :m(new TrainerConfigHelperPrivate()) {
+TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
+    : m(new TrainerConfigHelperPrivate())
 {
  m->conf = config;
}

-
TrainerConfigHelper::~TrainerConfigHelper() {
  if (m) {
    delete m;
  }
}

-const TrainerConfig &
-TrainerConfigHelper::getConfig() const {
-  return m->conf;
-}
+const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }

-TrainerConfig& TrainerConfigHelper::getMutableConfig() {
-  return m->conf;
-}
+TrainerConfig &TrainerConfigHelper::getMutableConfig() { return m->conf; }

const OptimizationConfig &TrainerConfigHelper::getOptConfig() const {
  return m->conf.opt_config();
@@ -173,8 +165,7 @@ std::string TrainerConfigHelper::getConfigName(bool *ok) const {
  } else if (!m->conf.init_model_path().empty()) {
    retv = getConfigNameFromPath(m->conf.init_model_path());
  } else if (m->conf.start_pass() >= 1) {
-    retv = getConfigNameFromPassId(m->conf.start_pass(),
-                                   m->conf.save_dir());
+    retv = getConfigNameFromPassId(m->conf.start_pass(), m->conf.save_dir());
  }

  if (ok) {
@@ -191,8 +182,8 @@ std::shared_ptr<TrainerConfigHelper> TrainerConfigHelper::createFromFlags() {
  } else if (!FLAGS_init_model_path.empty()) {
    configPath = getConfigNameFromPath(FLAGS_init_model_path);
  } else if (FLAGS_start_pass >= 1) {
-    configPath = getConfigNameFromPassId(FLAGS_start_pass - 1,
-                                         FLAGS_init_model_path);
+    configPath =
+        getConfigNameFromPassId(FLAGS_start_pass - 1, FLAGS_init_model_path);
  } else {
    return nullptr;
  }
diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h
index d3ad1eeeb43bc6be0b944e2059dddeab734efb75..d20684964136a553b2d4119e8db5a1de084278bb 100644
--- a/paddle/trainer/TrainerConfigHelper.h
+++ b/paddle/trainer/TrainerConfigHelper.h
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once

#include
@@ -27,7 +26,6 @@ struct TrainerConfigHelperPrivate;
class ModelConfig;
class DataConfig;

-
/**
 * @brief TrainerConfig Helper. A class wrap protobuf's TrainerConfig Object,
 * simplify the usage for TrainerConfig.
@@ -46,7 +44,7 @@ public:
   * @brief Ctor, Create a TrainerConfig from config file
   * @param configFilePath Config file path.
   */
-  explicit TrainerConfigHelper(const std::string &configFilePath);
+  explicit TrainerConfigHelper(const std::string& configFilePath);
  explicit TrainerConfigHelper(const TrainerConfig& config);

  /**
@@ -106,7 +104,6 @@ public:
   */
  bool hasTestDataConfig() const;

-
  /**
   * @brief Update trainer config from command line flags.
   *        Override config's (save_dir, init_model_path, start_pass) if command
   */
  void updateConfigFromFlags();

-
  /**
   * @brief Disable optimization's sparse remote update.
   */
@@ -125,13 +121,10 @@ public:
   */
  void disableRemoteSparseUpdaterForEachParams();

-
  /**
   * @brief implicit conversion.
   */
-  inline operator const TrainerConfig&() const {
-    return this->getConfig();
-  }
+  inline operator const TrainerConfig&() const { return this->getConfig(); }

  /**
   * @brief implicit conversion.
@@ -143,16 +136,12 @@ public:
  /**
   * @brief implicit conversion.
   */
-  inline operator const DataConfig&() const {
-    return this->getDataConfig();
-  }
+  inline operator const DataConfig&() const { return this->getDataConfig(); }

  /**
   * @brief implicit conversion.
   */
-  inline operator const ModelConfig&() const {
-    return this->getModelConfig();
-  }
+  inline operator const ModelConfig&() const { return this->getModelConfig(); }

  /**
   * @brief Get mutable optimization config.
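A small usage sketch for the TrainerConfigHelper API above; the config filename is illustrative. Construction runs the Python config parser (parse_config_and_serialize, as named above) and hands the embedded protobufs to callers:

// Sketch only: load a trainer config and read its optimization settings.
paddle::TrainerConfigHelper helper("trainer_config.conf");  // hypothetical file
const paddle::OptimizationConfig& opt = helper.getOptConfig();
int batchSize = opt.batch_size();  // plain protobuf accessors from here on
(void)batchSize;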
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index e23e42927c381d6efa9a3eef47f7e99f0a65b013..b1c3bf26d21d1760cd1710f372aa8a89fb7b101b 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "TrainerInternal.h"

#include
@@ -37,30 +36,31 @@ limitations under the License. */

namespace paddle {

-void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper> &config,
-                           const GradientMachinePtr &gradientMachine,
-                           std::unique_ptr<TrainerInternalConfig> &&intconfig,
-                           const std::shared_ptr<TrainerStats> &stats,
+void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper>& config,
+                           const GradientMachinePtr& gradientMachine,
+                           std::unique_ptr<TrainerInternalConfig>&& intconfig,
+                           const std::shared_ptr<TrainerStats>& stats,
                           bool testing) {
-  config_ = config;
-  intconfig_ = std::move(intconfig);
-  stats_ = stats;
+  config_ = config;
+  intconfig_ = std::move(intconfig);
+  stats_ = stats;

-  //! in training will use parameter updater definitely.
-  //! But only use parameter in testing mode when some parameter in pserver.
-  if (!testing || (config_->getOptConfig().use_sparse_remote_updater() &&
+  //! in training will use parameter updater definitely.
+  //! But only use parameter in testing mode when some parameter in pserver.
+  if (!testing || (config_->getOptConfig().use_sparse_remote_updater() &&
                   intconfig_->loadsave_parameters_in_pserver)) {
-    createParameterUpdater(testing);
-  }
+    createParameterUpdater(testing);
+  }

-  gradientMachine_ = gradientMachine;
-  if (!gradientMachine) {
-    CHECK(config_->getConfig().has_model_config())
-        << "Missing model_config in trainer_config";
-    gradientMachine_.reset(GradientMachine::create(
-        config_->getConfig().model_config(), intconfig_->mode,
-        parameterUpdater_->getParameterTypes()));
-  }
+  gradientMachine_ = gradientMachine;
+  if (!gradientMachine) {
+    CHECK(config_->getConfig().has_model_config())
+        << "Missing model_config in trainer_config";
+    gradientMachine_.reset(
+        GradientMachine::create(config_->getConfig().model_config(),
+                                intconfig_->mode,
+                                parameterUpdater_->getParameterTypes()));
+  }
}

void TrainerInternal::trainOneBatch(int64_t batchId,
@@ -96,8 +96,8 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
    parameterUpdater_->getParametersRemote();
  }

-  UpdateCallback updateCallback =
-      [this, showStats, &paraStats](Parameter* para) {
+  UpdateCallback updateCallback = [this, showStats, &paraStats](
+      Parameter* para) {
    if (showStats) {
      //!
 @TODO(yuyang18) Show stats is actually a ParameterHook, refactor
      // it
@@ -116,8 +116,8 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
    timer.start();
#endif
    REGISTER_TIMER("forwardBackward");
-    forwardBackwardBatch(inArgs, *outArgs, passType, updateCallback,
-                         doPipelineUpdate);
+    forwardBackwardBatch(
+        inArgs, *outArgs, passType, updateCallback, doPipelineUpdate);
#ifndef PADDLE_DISABLE_TIMER
    timer.stop();
    parameterUpdater_->setForwardbackwardTime(timer.get());
@@ -147,7 +147,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
    gradientMachine_->eval(evaluator_);
  }

-  *stats_ += { actualBatchSize, cost };
+  *stats_ += {actualBatchSize, cost};
  {
    REGISTER_TIMER("finishBatch");
    parameterUpdater_->finishBatch(cost);
@@ -162,12 +162,11 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
    if (intconfig_->dot_period > 0) {
      std::cerr << std::endl;
    }
-    LOG(INFO) << " Batch=" << batchId + 1 << " "
-              << *stats_
+    LOG(INFO) << " Batch=" << batchId + 1 << " " << *stats_
              << " Eval: " << *evaluator_
              << " CurrentEval: " << *currentEvaluator_;
  } else if (intconfig_->dot_period > 0 &&
-      (batchId + 1) % intconfig_->dot_period == 0) {
+             (batchId + 1) % intconfig_->dot_period == 0) {
    std::cerr << ".";
  }
}
@@ -179,13 +178,13 @@ void TrainerInternal::finishTrainPass(int passId, int batchId) {
  gradientMachine_->onPassEnd();
  parameterUpdater_->finishPass();
  evaluator_->finish();
-  LOG(INFO) << " Pass=" << passId << " Batch=" << batchId
-            << " " << stats_->getStats(false /*without current cost*/)
+  LOG(INFO) << " Pass=" << passId << " Batch=" << batchId << " "
+            << stats_->getStats(false /*without current cost*/)
            << " Eval: " << *evaluator_;
}

-void TrainerInternal::showParameterStats(const std::vector<ParaStat>&
-                                         paraStats) {
+void TrainerInternal::showParameterStats(
+    const std::vector<ParaStat>& paraStats) {
  std::vector<ParameterPtr>& parameters = gradientMachine_->getParameters();
  for (auto& parameter : parameters) {
    SetDevice device(parameter->getDeviceId());
@@ -218,18 +217,21 @@ void TrainerInternal::showParameterStats(const std::vector<ParaStat>&
void TrainerInternal::createParameterUpdater(bool testing) {
  const std::string& alg = config_->getOptConfig().algorithm();
  parameterUpdater_.reset(ParameterUpdaterCreators::tryCreateUpdater(
-      alg, config_->getOptConfig(), intconfig_->local,
-      intconfig_->num_passes));
-  if (parameterUpdater_) { return; }
+      alg, config_->getOptConfig(), intconfig_->local, intconfig_->num_passes));
+  if (parameterUpdater_) {
+    return;
+  }

  if (!intconfig_->local) {
    if (testing && config_->getOptConfig().use_sparse_remote_updater()) {
      std::unique_ptr<ParameterUpdater> localUpdater;
      localUpdater.reset(
          new SgdLocalUpdater(config_->getOptConfig()));  // do nothing
-      parameterUpdater_.reset(new SparseRemoteParameterUpdaterComposite(
-          config_->getOptConfig(), intconfig_->num_passes, testing,
-          std::move(localUpdater)));
+      parameterUpdater_.reset(
+          new SparseRemoteParameterUpdaterComposite(config_->getOptConfig(),
+                                                    intconfig_->num_passes,
+                                                    testing,
+                                                    std::move(localUpdater)));
    } else {
      if (GradientMachine::kSgdSparseCpuTraining == intconfig_->mode &&
          !intconfig_->use_old_updater) {
@@ -251,21 +253,18 @@ void TrainerInternal::createParameterUpdater(bool testing) {
      }

      localUpdater.reset(
-          intconfig_->use_old_updater
+          intconfig_->use_old_updater
new RemoteParameterUpdater( - *config_, - intconfig_->num_passes, - std::move(localUpdater)) + *config_, intconfig_->num_passes, std::move(localUpdater)) : new ConcurrentRemoteParameterUpdater( - *config_, - intconfig_->num_passes, - std::move(localUpdater))); - + *config_, intconfig_->num_passes, std::move(localUpdater))); if (config_->getOptConfig().use_sparse_remote_updater()) { - localUpdater.reset(new SparseRemoteParameterUpdaterComposite( - *config_, intconfig_->num_passes, testing, - std::move(localUpdater))); + localUpdater.reset( + new SparseRemoteParameterUpdaterComposite(*config_, + intconfig_->num_passes, + testing, + std::move(localUpdater))); } this->parameterUpdater_ = std::move(localUpdater); @@ -282,8 +281,7 @@ void TrainerInternal::createParameterUpdater(bool testing) { } else if (intconfig_->use_gpu && config_->getOptConfig().do_average_in_cpu() && config_->getOptConfig().average_window() > 0) { - parameterUpdater_.reset( - new SgdUpdaterWithCpuAverager(*config_)); + parameterUpdater_.reset(new SgdUpdaterWithCpuAverager(*config_)); } else { parameterUpdater_.reset(new SgdLocalUpdater(*config_)); } @@ -294,10 +292,10 @@ void TrainerInternal::createParameterUpdater(bool testing) { } void TrainerInternal::forwardBackwardBatch(const std::vector& inArgs, - std::vector& outArgs, - PassType& passType, - UpdateCallback updateCallback, - bool doPipelineUpdate) { + std::vector& outArgs, + PassType& passType, + UpdateCallback updateCallback, + bool doPipelineUpdate) { gradientMachine_->forwardBackward( inArgs, &outArgs, passType, doPipelineUpdate ? updateCallback : nullptr); } diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h index 3a53aa1d17b31ad3e7c1aa53f622c6399baa834e..962d53a30e5454060e8ce864c347c37b9cc98116 100644 --- a/paddle/trainer/TrainerInternal.h +++ b/paddle/trainer/TrainerInternal.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -28,7 +27,6 @@ limitations under the License. 
*/ #include "TrainerConfigHelper.h" #include "TrainerInternalConfig.h" - namespace paddle { /** @@ -40,12 +38,10 @@ public: struct ParaStat { real maxAbsGrad; real avgAbsGrad; - ParaStat() :maxAbsGrad(.0), avgAbsGrad(.0){ - } + ParaStat() : maxAbsGrad(.0), avgAbsGrad(.0) {} }; - TrainerInternal() { - } + TrainerInternal() {} /** * Intializes trainer internal class @@ -55,10 +51,10 @@ public: * @param stats training stats * @param testing if it is in testing phase */ - void init(const std::shared_ptr &config, - const GradientMachinePtr &machine, - std::unique_ptr &&intconfig, - const std::shared_ptr &stats, + void init(const std::shared_ptr& config, + const GradientMachinePtr& machine, + std::unique_ptr&& intconfig, + const std::shared_ptr& stats, bool testing); virtual ~TrainerInternal() {} @@ -94,7 +90,7 @@ public: /** * getGradientMachine */ - inline const GradientMachinePtr & getGradientMachine() const { + inline const GradientMachinePtr& getGradientMachine() const { return gradientMachine_; } @@ -109,17 +105,13 @@ public: * setCurrentEvaluator * @param eval evaluator to set */ - inline void setCurrentEvaluator(Evaluator* eval) { - currentEvaluator_ = eval; - } + inline void setCurrentEvaluator(Evaluator* eval) { currentEvaluator_ = eval; } /** * setEvaluator * @param eval evaluator to set */ - inline void setEvaluator(Evaluator* eval) { - evaluator_ = eval; - } + inline void setEvaluator(Evaluator* eval) { evaluator_ = eval; } /** * forwardBackwardBatch diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/trainer/TrainerInternalConfig.cpp index 4a829a4df9e345d5d6b82740deea3cd005f6432b..0dc74cb3b39309b33a1a92dfa5a45e95defb4120 100644 --- a/paddle/trainer/TrainerInternalConfig.cpp +++ b/paddle/trainer/TrainerInternalConfig.cpp @@ -14,7 +14,8 @@ limitations under the License. */ #include "TrainerInternalConfig.h" -P_DEFINE_int32(show_parameter_stats_period, 0, +P_DEFINE_int32(show_parameter_stats_period, + 0, "Whether to show parameter stats during training"); P_DEFINE_int32(dot_period, 1, "Print '.' every so many batches"); diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h index 9b59143bade737d9cde225836b8ae634e8e1543f..b7bfd29abd729b33ca953fb20835c57cbcf3ef74 100644 --- a/paddle/trainer/TrainerInternalConfig.h +++ b/paddle/trainer/TrainerInternalConfig.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "paddle/utils/Util.h" @@ -94,9 +93,7 @@ public: * @brief get all processed samples' number * @return all processed samples' number */ - inline int64_t getNumProcessed() const { - return this->numProcessed_; - } + inline int64_t getNumProcessed() const { return this->numProcessed_; } /** * @brief same function as addCost. But it is simple to invoke. @@ -111,7 +108,7 @@ public: * @param p a pair of parameter, first is numProcessed, second is cost. * @return *this */ - inline TrainerStats& operator += (const std::pair& p) { + inline TrainerStats& operator+=(const std::pair& p) { this->addCost(p.first, p.second); return *this; } @@ -121,9 +118,7 @@ public: * * reset stat when constructed. */ - inline TrainerStats() { - this->reset(); - } + inline TrainerStats() { this->reset(); } /** * @brief show stats to ostream. 
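[Reviewer aside, not part of the patch: the `operator+=` hunk above is easier to read next to a call site such as `*stats_ += {actualBatchSize, cost};` in TrainerInternal.cpp. Below is a minimal, self-contained sketch of that accumulation pattern; the class body is a simplified stand-in written for this note, not the real paddle::TrainerStats.]

```cpp
#include <cstdint>
#include <iostream>
#include <utility>

// Simplified stand-in for paddle::TrainerStats, kept only to show the
// brace-initialized accumulation call sites this patch reformats.
class TrainerStats {
public:
  TrainerStats() { reset(); }

  void reset() {
    numProcessed_ = 0;
    totalCost_ = 0.0;
  }

  // Same signature shape as the diff: first = numProcessed, second = cost.
  TrainerStats& operator+=(const std::pair<int64_t, double>& p) {
    numProcessed_ += p.first;
    totalCost_ += p.second;
    return *this;
  }

  int64_t getNumProcessed() const { return numProcessed_; }
  double getAvgCost() const {
    return numProcessed_ == 0 ? 0.0 : totalCost_ / numProcessed_;
  }

private:
  int64_t numProcessed_;
  double totalCost_;
};

int main() {
  TrainerStats stats;
  stats += {128, 64.0};  // mirrors `*stats_ += {actualBatchSize, cost};`
  stats += {128, 32.0};
  std::cout << "samples=" << stats.getNumProcessed()
            << " AvgCost=" << stats.getAvgCost() << std::endl;  // 256, 0.375
  return 0;
}
```

The braced pair on the right-hand side is why the clang-format change from `{ actualBatchSize, cost }` to `{actualBatchSize, cost}` is whitespace-only: both spellings list-initialize the same std::pair argument.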
@@ -137,7 +132,7 @@ public: os << "samples=" << this->getNumProcessed() << " AvgCost=" << this->getAvgCost(); if (withCurrentCost) { - os << " CurrentCost=" << this->getCurrentAvgCost(); + os << " CurrentCost=" << this->getCurrentAvgCost(); } } diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index a486cc383ace62111dbdbdd98e83710831a64095..e23e745d99c7b10fb780cb0c89e27207eefc19c1 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include #include "paddle/utils/PythonUtil.h" #include "paddle/utils/StringUtil.h" @@ -34,7 +33,7 @@ P_DECLARE_string(rdma_tcp); using namespace paddle; // NOLINT int main(int argc, char** argv) { - // write logs instantly (never buffer log messages) +// write logs instantly (never buffer log messages) #ifdef PADDLE_USE_GLOG FLAGS_logbuflevel = -1; #endif diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h index a0b5c2274b20fdbce76d021326f22b3181f3d9d1..cb657d219e55c1e349ffb77a88945085b4149c78 100644 --- a/paddle/trainer/tests/picojson.h +++ b/paddle/trainer/tests/picojson.h @@ -409,7 +409,8 @@ inline std::string value::to_str() const { case number_type: { char buf[256]; double tmp; - SNPRINTF(buf, sizeof(buf), + SNPRINTF(buf, + sizeof(buf), fabs(u_.number_) < (1ULL << 53) && modf(u_.number_, &tmp) == 0 ? "%.f" : "%.17g", @@ -532,7 +533,8 @@ void value::_serialize(Iter oi, int indent) const { ++indent; } for (object::const_iterator i = u_.object_->begin(); - i != u_.object_->end(); ++i) { + i != u_.object_->end(); + ++i) { if (i != u_.object_->begin()) { *oi++ = ','; } @@ -983,7 +985,9 @@ inline std::string parse(value& out, Iter& pos, const Iter& last) { } template -inline Iter _parse(Context& ctx, const Iter& first, const Iter& last, +inline Iter _parse(Context& ctx, + const Iter& first, + const Iter& last, std::string* err) { input in(first, last); if (!_parse(ctx, in) && err != NULL) { @@ -1003,7 +1007,9 @@ inline Iter _parse(Context& ctx, const Iter& first, const Iter& last, } template -inline Iter parse(value& out, const Iter& first, const Iter& last, +inline Iter parse(value& out, + const Iter& first, + const Iter& last, std::string* err) { default_parse_context ctx(&out); return _parse(ctx, first, last, err); @@ -1017,8 +1023,10 @@ inline std::string parse(value& out, const std::string& s) { inline std::string parse(value& out, std::istream& is) { std::string err; - parse(out, std::istreambuf_iterator(is.rdbuf()), - std::istreambuf_iterator(), &err); + parse(out, + std::istreambuf_iterator(is.rdbuf()), + std::istreambuf_iterator(), + &err); return err; } diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp index 735c5a5b27d8189195be8a720158977edc5d8c9e..03312f9e470e0f8b01e229237d25a7ac8e088c5c 100644 --- a/paddle/trainer/tests/test_Compare.cpp +++ b/paddle/trainer/tests/test_Compare.cpp @@ -52,8 +52,8 @@ void calcGradient(bool useGpu, comData& Data) { vector& inArgs = dataBatch.getStreams(); trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); for (int i = 0; i < 2; ++i) { - trainer.getGradientMachine()->forwardBackward(inArgs, &Data.outArgs, - PASS_TRAIN); + trainer.getGradientMachine()->forwardBackward( + inArgs, &Data.outArgs, PASS_TRAIN); } trainer.getGradientMachine()->finish(); } diff --git 
a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp index 311dd333a1b1638e75ca7aaf441c441d3cf54447..a7c6862ce3362556fa60cc3309445347476e7f33 100644 --- a/paddle/trainer/tests/test_CompareSparse.cpp +++ b/paddle/trainer/tests/test_CompareSparse.cpp @@ -23,7 +23,7 @@ using namespace paddle; // NOLINT using namespace std; // NOLINT static const string& configFile1 = - "trainer/tests/sample_trainer_config_qb_rnn.conf"; + "trainer/tests/sample_trainer_config_qb_rnn.conf"; P_DECLARE_bool(use_gpu); P_DECLARE_string(config); @@ -38,8 +38,9 @@ P_DECLARE_bool(local); P_DECLARE_bool(use_old_updater); P_DECLARE_bool(parallel_nn); P_DECLARE_string(config_args); -P_DEFINE_double(max_diff_ratio, 0.0f, - "max diff ratio allowed for parameters value"); +P_DEFINE_double(max_diff_ratio, + 0.0f, + "max diff ratio allowed for parameters value"); int gNumDevices = 0; @@ -53,8 +54,7 @@ std::vector trainerOnePassTest(const string& configFile, FLAGS_config_args = sparseUpdate ? "sparse_update=1" : "sparse_update=0"; LOG(INFO) << " useGpu=" << useGpu << " trainerCount=" << trainerCount - << " configFile=" << configFile - << " sparseUpdate=" << sparseUpdate; + << " configFile=" << configFile << " sparseUpdate=" << sparseUpdate; srand(FLAGS_seed); *ThreadLocalRand::getSeed() = FLAGS_seed; ThreadLocalRandomEngine::get().seed(FLAGS_seed); @@ -91,8 +91,12 @@ std::vector& getDenseParameters() { return denseParameters; } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, double maxDiffRatio) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + double maxDiffRatio) { double maxDiff = 0; double maxValue = 0; for (size_t i = 0; i < len; ++i) { @@ -101,10 +105,8 @@ void checkBuffer(real* A, const char* desA, real* B, const char* desB, maxDiff = std::max(maxDiff, diff); } EXPECT_LE(maxDiff / maxValue, maxDiffRatio); - LOG(INFO) << " maxDiff=" << maxDiff - << " maxValue=" << maxValue - << " maxDiff/maxValue=" << maxDiff / maxValue - << "\n\n"; + LOG(INFO) << " maxDiff=" << maxDiff << " maxValue=" << maxValue + << " maxDiff/maxValue=" << maxDiff / maxValue << "\n\n"; } void compareValue(const vector& parametersA, @@ -125,8 +127,12 @@ void compareValue(const vector& parametersA, LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), "para_A", paraB.getData(), "para_B", - paraA.getSize(), maxDiffRatio); + checkBuffer(paraA.getData(), + "para_A", + paraB.getData(), + "para_B", + paraA.getSize(), + maxDiffRatio); } } @@ -172,8 +178,7 @@ TEST(compareSparse, multiGradientMachine) { if (useGpu) continue; #endif FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local - << " useGpu=" << useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; int trainerCount = useGpu ? 
numGpu : 2; std::vector parameters = trainerOnePassTest(configFile1, true, trainerCount, useGpu); @@ -197,8 +202,7 @@ TEST(compareSparse, NeuralNetwork) { if (useGpu) continue; #endif FLAGS_parallel_nn = useGpu; - LOG(INFO) << " local=" << local - << " useGpu=" << useGpu; + LOG(INFO) << " local=" << local << " useGpu=" << useGpu; int trainerCount = 1; std::vector parameters = trainerOnePassTest(configFile1, true, trainerCount, useGpu); diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp index d1057f2aeabd3bcc41330f1cfe72227de3837140..81320da6ac9c6e880b936a6b1e2650796bb50ff7 100644 --- a/paddle/trainer/tests/test_CompareTwoNets.cpp +++ b/paddle/trainer/tests/test_CompareTwoNets.cpp @@ -32,10 +32,12 @@ P_DECLARE_string(nics); P_DEFINE_string(config_file_a, "", "config of one network to compare"); P_DEFINE_string(config_file_b, "", "config of another network to compare"); -P_DEFINE_bool(need_high_accuracy, false, +P_DEFINE_bool(need_high_accuracy, + false, "whether need to run in double accuracy"); P_DEFINE_double( - max_diff_ratio, 0.0f, + max_diff_ratio, + 0.0f, "max diff ratio allowed for outputs and parameters (value/gradient)"); P_DECLARE_bool(thread_local_rand_use_global_seed); P_DECLARE_int32(seed); @@ -71,14 +73,18 @@ void calcGradient(ComData& data, const string configFile) { vector& inArgs = dataBatch.getStreams(); trainer.getGradientMachine()->start(trainer.getConfig(), nullptr); - trainer.getGradientMachine()->forwardBackward(inArgs, &data.outArgs, - PASS_TRAIN); + trainer.getGradientMachine()->forwardBackward( + inArgs, &data.outArgs, PASS_TRAIN); trainer.getGradientMachine()->finish(); } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, size_t width = 1) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { int nNum = 0; real maxVal = 0; for (size_t i = 0; i < len; ++i) { @@ -90,8 +96,8 @@ void checkBuffer(real* A, const char* desA, real* B, const char* desB, maxDiff = std::max(maxDiff, diff); if (diff > maxVal * FLAGS_max_diff_ratio) { nNum++; - VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] - << " " << desB << " : " << B[i] << " diff=" << diff; + VLOG(1) << "Row: " << i / width << ", " << desA << " : " << A[i] << " " + << desB << " : " << B[i] << " diff=" << diff; } } EXPECT_EQ(0, nNum); @@ -114,8 +120,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n--------------------------------" << " Check Network Output_" << i << ":" << " -------------------------------------\n"; - checkBuffer(matA.getData(), "network A output", matB.getData(), - "network B output", matA.getElementCnt(), matA.getWidth()); + checkBuffer(matA.getData(), + "network A output", + matB.getData(), + "network B output", + matA.getElementCnt(), + matA.getWidth()); } vector& parametersA = comDataA.parameters; @@ -136,7 +146,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), "Network A", paraB.getData(), "Network B", + checkBuffer(paraA.getData(), + "Network A", + paraB.getData(), + "Network B", paraA.getSize()); CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); @@ -144,7 +157,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() << " ; size : " 
<< gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), "Network A", gradB.getData(), "Network B", + checkBuffer(gradA.getData(), + "Network A", + gradB.getData(), + "Network B", gradA.getSize()); } } diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp index 2c44da43fcd698808805480599f2c6223d120f8d..a52f2fa7e7708925dbcb173167b17bbfef93a4da 100644 --- a/paddle/trainer/tests/test_CompareTwoOpts.cpp +++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp @@ -32,11 +32,13 @@ P_DECLARE_string(nics); P_DEFINE_string(config_file_a, "", "config of one network to compare"); P_DEFINE_string(config_file_b, "", "config of another network to compare"); -P_DEFINE_bool(need_high_accuracy, true, +P_DEFINE_bool(need_high_accuracy, + true, "whether need to run in double accuracy (recommended)"); P_DEFINE_double( - max_diff_ratio, 0.0f, - "max diff ratio allowed for outputs and parameters (value/gradient)"); + max_diff_ratio, + 0.0f, + "max diff ratio allowed for outputs and parameters (value/gradient)"); struct ComData { vector outArgs; @@ -62,8 +64,12 @@ void calcGradient(ComData& data, const string configFile) { trainer.train(); } -void checkBuffer(real* A, const char* desA, real* B, const char* desB, - size_t len, size_t width = 1) { +void checkBuffer(real* A, + const char* desA, + real* B, + const char* desB, + size_t len, + size_t width = 1) { int nNum = 0; for (size_t i = 0; i < len; ++i) { real diff = fabs(A[i] - B[i]); @@ -94,8 +100,12 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n--------------------------------" << " Check Network Output_" << i << ":" << " -------------------------------------\n"; - checkBuffer(matA.getData(), "network A output", matB.getData(), - "network B output", matA.getElementCnt(), matA.getWidth()); + checkBuffer(matA.getData(), + "network A output", + matB.getData(), + "network B output", + matA.getElementCnt(), + matA.getWidth()); } vector& parametersA = comDataA.parameters; @@ -116,7 +126,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_VALUE: " << parameterA->getName() << " ; size : " << paraA.getSize() << " ------------"; - checkBuffer(paraA.getData(), "Network A", paraB.getData(), "Network B", + checkBuffer(paraA.getData(), + "Network A", + paraB.getData(), + "Network B", paraA.getSize()); CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT)); @@ -124,7 +137,10 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName() << " ; size : " << gradA.getSize() << " -----------"; - checkBuffer(gradA.getData(), "Network A", gradB.getData(), "Network B", + checkBuffer(gradA.getData(), + "Network A", + gradB.getData(), + "Network B", gradA.getSize()); } } diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp index 1c7f93666b8dfd0307797fc5e20b05b355c75a38..6db33439b319e84e99e828246ca672fa8274e4bf 100644 --- a/paddle/trainer/tests/test_Prediction.cpp +++ b/paddle/trainer/tests/test_Prediction.cpp @@ -20,7 +20,8 @@ limitations under the License. 
*/ P_DECLARE_string(config); P_DECLARE_string(config_args); -P_DEFINE_string(merger, "./paddle_merge_model", +P_DEFINE_string(merger, + "./paddle_merge_model", "path to paddle_merge_model binary"); using namespace paddle; // NOLINT @@ -120,8 +121,10 @@ TEST(GradientMachine, create) { rand() / (real)RAND_MAX; // NOLINT TODO(yuyang): use rand_r } } - MatrixPtr input = Matrix::create(numSamples, inputDim, - /* trans */ false, FLAGS_use_gpu); + MatrixPtr input = Matrix::create(numSamples, + inputDim, + /* trans */ false, + FLAGS_use_gpu); input->copyFrom(cpuInput); inArgs[0].value = input; gradientMachine1->forward(inArgs, &outArgs, PASS_TEST); @@ -139,8 +142,8 @@ TEST(GradientMachine, create) { gradientMachine3->forward(inArgs, &outArgs2, PASS_TEST); out2.copyFrom(*outArgs2[0].value); - checkBuffer(out1.getData(), out2.getData(), - out2.getHeight() * out2.getWidth()); + checkBuffer( + out1.getData(), out2.getData(), out2.getHeight() * out2.getWidth()); cmd = " rm -rf " + modelDir + "/*"; LOG(INFO) << "cmd " << cmd; diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp index 49332b877db646fdcd7cd3b11ec96bac64dd2d6d..e53291386c6b553e26248dae75e321d4b7246823 100644 --- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp +++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #ifndef PADDLE_NO_PYTHON #include #include diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp index ad2a715ef89c6f4c4b509e1a8b816699b709c59d..900c05af851aede67253535228d75d211dee6a85 100644 --- a/paddle/trainer/tests/test_Trainer.cpp +++ b/paddle/trainer/tests/test_Trainer.cpp @@ -33,7 +33,9 @@ P_DECLARE_string(config); P_DECLARE_int32(gpu_id); P_DECLARE_bool(allow_only_one_model_on_one_gpu); -void checkGradientTest(const string& configFile, bool useGpu, bool parallel, +void checkGradientTest(const string& configFile, + bool useGpu, + bool parallel, int trainerCount = 1) { FLAGS_use_gpu = useGpu; FLAGS_parallel_nn = parallel; @@ -94,7 +96,7 @@ TEST(checkGradient, multi) { TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); } TEST(checkGradient, chunk) { -#if defined(__APPLE__) || defined (__OSX__) +#if defined(__APPLE__) || defined(__OSX__) EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py")); #else EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py")); diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp index 4554b94485f99f1fea1ebef8f5ae8a59b630d106..da2954d1664fc18cb78e6217807ff9799d220f7f 100644 --- a/paddle/trainer/tests/test_TrainerOnePass.cpp +++ b/paddle/trainer/tests/test_TrainerOnePass.cpp @@ -41,12 +41,13 @@ public: } }; - - int gNumDevices = 0; -void trainerOnePassTest(const string& configFile, bool useGpu, bool parallel, - int trainerCount = 1, double averageWindow = 0.0f, +void trainerOnePassTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1, + double averageWindow = 0.0f, bool doAverageInCpu = false) { FLAGS_use_gpu = useGpu; FLAGS_parallel_nn = parallel; @@ -164,13 +165,13 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { const vector& inArgs = dataBatch.getStreams(); vector outArgs; - UpdateCallback updateCallback = - [parameterUpdater, parameterCheck](Parameter* 
para) { - parameterCheck[para->getID()] - ->getBuf(PARAMETER_GRADIENT) - ->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); - parameterUpdater->update(para); - }; + UpdateCallback updateCallback = [parameterUpdater, + parameterCheck](Parameter* para) { + parameterCheck[para->getID()] + ->getBuf(PARAMETER_GRADIENT) + ->copyFrom(*para->getBuf(PARAMETER_GRADIENT)); + parameterUpdater->update(para); + }; parameterUpdater->startPass(); parameterUpdaterCheck->startPass(); @@ -178,8 +179,8 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { for (int i = 0; i < config.opt_config().num_batches_per_get_parameter() * 2; ++i) { PassType passType = parameterUpdater->startBatch(actualBatchSize); - gradientMachine->forwardBackward(inArgs, &outArgs, passType, - updateCallback); + gradientMachine->forwardBackward( + inArgs, &outArgs, passType, updateCallback); parameterUpdater->finishBatch(0); parameterUpdaterCheck->startBatch(actualBatchSize); @@ -191,7 +192,7 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { double sum = 0.0f; for (size_t i = 0; i != parameters.size(); ++i) { - real* v1, *v2; + real *v1, *v2; CpuVector trainerPara(parameters[i]->getSize()); trainerPara.copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE)); if (!FLAGS_use_gpu) { @@ -217,8 +218,10 @@ double checkRemoteParameterUpdater(TrainerForTest& trainer) { return sum; } -void checkRemoteParameterUpdaterTest(const string& configFile, bool useGpu, - bool parallel, int trainerCount = 1, +void checkRemoteParameterUpdaterTest(const string& configFile, + bool useGpu, + bool parallel, + int trainerCount = 1, bool useOldUpdater = false, int num_batches_per_get_parameter = 1) { FLAGS_use_gpu = useGpu; diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp index fcee318d16e00428bda447e80575dbf1b027102d..49e8a97ad057246addf29274dd9c436d1481de91 100644 --- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp +++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp @@ -51,8 +51,10 @@ void checkOutput(const string& expRetFile) { } } -void prepareInArgs(vector& inArgs, const size_t batchSize, - bool useGpu, bool hasSubseq) { +void prepareInArgs(vector& inArgs, + const size_t batchSize, + bool useGpu, + bool hasSubseq) { inArgs.clear(); // sentence id Argument sentId; @@ -87,7 +89,9 @@ void prepareInArgs(vector& inArgs, const size_t batchSize, inArgs.emplace_back(dummyInput); } -void testGeneration(const string& configFile, bool useGpu, bool hasSubseq, +void testGeneration(const string& configFile, + bool useGpu, + bool hasSubseq, const string& expRetFile) { FLAGS_use_gpu = useGpu; auto config = std::make_shared(configFile); @@ -114,8 +118,10 @@ TEST(RecurrentGradientMachine, test_generation) { #else const auto useGpuConfs = {true, false}; #endif - auto testGen = [&](const string& configFile, bool hasSubseq, - const string& expRetFile, bool beam_search) { + auto testGen = [&](const string& configFile, + bool hasSubseq, + const string& expRetFile, + bool beam_search) { FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0"; for (auto useGpu : useGpuConfs) { testGeneration(configFile, useGpu, hasSubseq, expRetFile); @@ -126,7 +132,9 @@ TEST(RecurrentGradientMachine, test_generation) { // In hierarchical RNN, beam search and one way search are only in inner-RNN, // outer-RNN will concat the generated inner-results (first for beam search) // from inner-RNN. Thus, they have the same outer-results. 
- testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", + testGen(NEST_CONFIG_FILE, + true, + expectFile + ".nest", false); // no beam search testGen(NEST_CONFIG_FILE, true, expectFile + ".nest", true); // beam search } diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp index f083ef398230ea0b7da9ff0a756a6a91ff8ed413..82c5b84e5960753d5ec4c35bd667a8e43269e9e1 100644 --- a/paddle/utils/BarrierStat.cpp +++ b/paddle/utils/BarrierStat.cpp @@ -20,17 +20,19 @@ limitations under the License. */ #include "paddle/utils/BarrierStat.h" #include "paddle/utils/Flags.h" -P_DEFINE_bool(log_barrier_abstract, true, +P_DEFINE_bool(log_barrier_abstract, + true, "if true, show abstract of barrier performance"); -P_DEFINE_int32(log_barrier_lowest_nodes, 5, +P_DEFINE_int32(log_barrier_lowest_nodes, + 5, "how many lowest node will be logged"); -P_DEFINE_bool(log_barrier_show_log, false, // for performance tuning insight +P_DEFINE_bool(log_barrier_show_log, + false, // for performance tuning insight "if true, always show barrier abstract even with little gap"); namespace paddle { -std::ostream &operator<<(std::ostream &output, - const BarrierStatBase &stat) { +std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) { if (FLAGS_log_barrier_abstract) { std::lock_guard guard(stat.lock_); stat.showAbstract(output); @@ -144,7 +146,8 @@ void BarrierEndStat::showAbstract(std::ostream &output) const { // duplicate freq info std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), outputAbstract.end(), + std::sort(outputAbstract.begin(), + outputAbstract.end(), [](const struct Abstract &a, const struct Abstract &b) { return a.freq > b.freq; }); @@ -280,7 +283,8 @@ void BarrierDeltaStat::showAbstract(std::ostream &output) const { // duplicate freq info std::vector outputAbstract = abstract_; - std::sort(outputAbstract.begin(), outputAbstract.end(), + std::sort(outputAbstract.begin(), + outputAbstract.end(), [](const struct Abstract &a, const struct Abstract &b) { return a.freq > b.freq; }); diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h index add1093758642ebacdcbde1ae28c2f85a5a63a1b..661340ad275365ab567175d4280abdab18444fac 100644 --- a/paddle/utils/BarrierStat.h +++ b/paddle/utils/BarrierStat.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -305,44 +304,44 @@ private: // nodes. // end barrier -#define __REGISTER_BARRIER_TIMER_SERVER(set, statName, numConnThreads, \ - trainerId, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - struct timeval cur; \ - gettimeofday(&cur, nullptr); \ - __stat->updateStat(cur, trainerId); \ - } \ +#define __REGISTER_BARRIER_TIMER_SERVER( \ + set, statName, numConnThreads, trainerId, ...) \ + do { \ + if (numConnThreads > 2) { \ + std::string internalName = \ + std::string(statName) + std::string(__VA_ARGS__); \ + BarrierStatPtr __stat = \ + (set).getStat(numConnThreads, internalName, BARRIER_END); \ + struct timeval cur; \ + gettimeofday(&cur, nullptr); \ + __stat->updateStat(cur, trainerId); \ + } \ } while (0); // end barrier with user-defined timer -#define __REGISTER_BARRIER_TIMER_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) 
\ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_END); \ - __stat->updateStat(cur, trainerId); \ - } \ +#define __REGISTER_BARRIER_TIMER_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) \ + do { \ + if (numConnThreads > 2) { \ + std::string internalName = \ + std::string(statName) + std::string(__VA_ARGS__); \ + BarrierStatPtr __stat = \ + (set).getStat(numConnThreads, internalName, BARRIER_END); \ + __stat->updateStat(cur, trainerId); \ + } \ } while (0); // delta barrier -#define __REGISTER_BARRIER_DELTA_SERVER_SET(set, statName, numConnThreads, \ - trainerId, delta, ...) \ - do { \ - if (numConnThreads > 2) { \ - std::string internalName = \ - std::string(statName) + std::string(__VA_ARGS__); \ - BarrierStatPtr __stat = \ - (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \ - __stat->updateStat(delta, trainerId); \ - } \ +#define __REGISTER_BARRIER_DELTA_SERVER_SET( \ + set, statName, numConnThreads, trainerId, delta, ...) \ + do { \ + if (numConnThreads > 2) { \ + std::string internalName = \ + std::string(statName) + std::string(__VA_ARGS__); \ + BarrierStatPtr __stat = \ + (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \ + __stat->updateStat(delta, trainerId); \ + } \ } while (0); // check end barrier @@ -374,10 +373,10 @@ private: */ // try to capture which trainer is slowest node in sync-sgd at pserver. -#define REGISTER_SLOW_NODES_PROBE(set, statName, numConnThreads, trainerId, \ - ...) \ - __REGISTER_BARRIER_TIMER_SERVER((set), statName, numConnThreads, trainerId, \ - __VA_ARGS__) +#define REGISTER_SLOW_NODES_PROBE( \ + set, statName, numConnThreads, trainerId, ...) \ + __REGISTER_BARRIER_TIMER_SERVER( \ + (set), statName, numConnThreads, trainerId, __VA_ARGS__) // try to check if all threads or trainers have passed barriers for data // accuracy. #define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \ @@ -385,12 +384,12 @@ private: #ifdef PADDLE_DISABLE_TIMER -#define REGISTER_BARRIER_TIMER_SERVER(set, statName, numConnThreads, \ - trainerId, ...) -#define REGISTER_BARRIER_TIMER_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) -#define REGISTER_BARRIER_DELTA_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) +#define REGISTER_BARRIER_TIMER_SERVER( \ + set, statName, numConnThreads, trainerId, ...) +#define REGISTER_BARRIER_TIMER_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) +#define REGISTER_BARRIER_DELTA_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) #else @@ -398,10 +397,10 @@ private: * sensing barrier time distribution for all parallelization threads. * it provides low API for slow node check(REGISTER_SLOW_NODES_PROBE) */ -#define REGISTER_BARRIER_TIMER_SERVER(set, statName, numConnThreads, \ - trainerId, ...) \ - __REGISTER_BARRIER_TIMER_SERVER((set), statName, numConnThreads, trainerId, \ - __VA_ARGS__) +#define REGISTER_BARRIER_TIMER_SERVER( \ + set, statName, numConnThreads, trainerId, ...) \ + __REGISTER_BARRIER_TIMER_SERVER( \ + (set), statName, numConnThreads, trainerId, __VA_ARGS__) /* * sensing barrier time distribution for all parallelization threads. @@ -410,18 +409,18 @@ private: * time distribution * for receiving data. */ -#define REGISTER_BARRIER_TIMER_SERVER_SET(set, statName, numConnThreads, \ - trainerId, cur, ...) 
\ - __REGISTER_BARRIER_TIMER_SERVER_SET((set), statName, numConnThreads, \ - trainerId, cur, __VA_ARGS__) +#define REGISTER_BARRIER_TIMER_SERVER_SET( \ + set, statName, numConnThreads, trainerId, cur, ...) \ + __REGISTER_BARRIER_TIMER_SERVER_SET( \ + (set), statName, numConnThreads, trainerId, cur, __VA_ARGS__) // try to capture time delta from all trainers, such as forwardBackward time // which implies // computation fluctuation -#define REGISTER_BARRIER_DELTA_SERVER_SET(set, statName, numConnThreads, \ - trainerId, delta, ...) \ - __REGISTER_BARRIER_DELTA_SERVER_SET((set), statName, numConnThreads, \ - trainerId, delta, __VA_ARGS__) +#define REGISTER_BARRIER_DELTA_SERVER_SET( \ + set, statName, numConnThreads, trainerId, delta, ...) \ + __REGISTER_BARRIER_DELTA_SERVER_SET( \ + (set), statName, numConnThreads, trainerId, delta, __VA_ARGS__) #endif // DISABLE_TIMER } // namespace paddle diff --git a/paddle/utils/ClassRegistrar.h b/paddle/utils/ClassRegistrar.h index 0c7747ac77a118e794a4b0d46d10b9cc1a2d15f5..ee58ccb2ad42ac9e5380e3a80fe0044965eab083 100644 --- a/paddle/utils/ClassRegistrar.h +++ b/paddle/utils/ClassRegistrar.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -63,16 +62,16 @@ public: // Create a class instance of type @type using args BaseClass* createByType(const std::string& type, CreateArgs... args) { ClassCreator creator; - CHECK(mapGet(type, creatorMap_, &creator)) - << "Unknown class type: " << type; + CHECK(mapGet(type, creatorMap_, &creator)) << "Unknown class type: " + << type; return creator(args...); } template inline void forEachType(T callback) { - for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) { - callback(it->first); - } + for (auto it = creatorMap_.begin(); it != creatorMap_.end(); ++it) { + callback(it->first); + } } protected: diff --git a/paddle/utils/CommandLineParser.cpp b/paddle/utils/CommandLineParser.cpp index 8edcad5747b419387a933b74a2b477ea82382054..307e304bb03d79fa9a640ece9c84845919b0d9c4 100644 --- a/paddle/utils/CommandLineParser.cpp +++ b/paddle/utils/CommandLineParser.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CommandLineParser.h" #ifndef PADDLE_USE_GFLAGS #include "paddle/utils/StringUtil.h" @@ -31,7 +30,6 @@ static constexpr int kStatusOK = 0; static constexpr int kStatusInvalid = 1; static constexpr int kStatusNotFound = 2; - /** * \brief: Convert a string to any type value. * @@ -48,13 +46,16 @@ template <> bool StringToValue(const std::string& content, bool* value) { std::string tmp = content; - std::transform(tmp.begin(), tmp.end(), tmp.begin(), [](char in) -> char { - if (in <= 'Z' && in >= 'A') { - return in - ('Z' - 'z'); - } else { - return in; - } - }); // tolower. + std::transform(tmp.begin(), + tmp.end(), + tmp.begin(), + [](char in) -> char { + if (in <= 'Z' && in >= 'A') { + return in - ('Z' - 'z'); + } else { + return in; + } + }); // tolower. 
if (tmp == "true" || tmp == "1") { *value = true; @@ -121,20 +122,16 @@ int ParseArgument(const std::string& argument, std::string* extraInfo) { * parse '--flag_name', '-flag_name' as true; '--noflag_name', '-noflag_name' as * false */ -static int ParseBoolArgumentExtra( - const std::string& argument, std::string* extraInfo) { +static int ParseBoolArgumentExtra(const std::string& argument, + std::string* extraInfo) { (void)(extraInfo); // unused extraInfo, just make api same. //! @warning: The order and content of prefixes is DESIGNED for parsing //! command line. The length of prefixes are 1, 2, 3, 4. The parse logic takes //! use of this fact. DO NOT CHANGE IT without reading how to parse command //! below. - static const std::vector > prefixes = { - {"-", true}, - {"--", true}, - {"-no", false}, - {"--no", false} - }; + static const std::vector> prefixes = { + {"-", true}, {"--", true}, {"-no", false}, {"--no", false}}; for (flags_internal::CommandLineFlagRegistry::Command& command : flags_internal::CommandLineFlagRegistry::Instance()->commands) { @@ -153,7 +150,6 @@ static int ParseBoolArgumentExtra( return kStatusNotFound; } - /** * \brief: Print command line arguments' usage with type T. */ @@ -170,12 +166,9 @@ static void PrintTypeUsage() { } } -template +template static void PrintTypeUsages() { - int unused[] = { - 0, - (PrintTypeUsage(), 0) ... - }; + int unused[] = {0, (PrintTypeUsage(), 0)...}; (void)(unused); } /** @@ -190,7 +183,8 @@ static void PrintUsageAndExit(const char* argv0) { /** * \brief: Print the error flags, usage, and exit. */ -static void PrintParseError(const std::string& name, const char* actualInput, +static void PrintParseError(const std::string& name, + const char* actualInput, const char* arg0) { std::cerr << "Parse command flag " << name << " error! User input is " << actualInput << std::endl; @@ -211,7 +205,7 @@ void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) { PrintParseError(extra, argv[i], argv[0]); \ } - ParseArgumentWithType(bool); // NOLINT + ParseArgumentWithType(bool); // NOLINT ParseArgumentWithType(int32_t); ParseArgumentWithType(double); // NOLINT ParseArgumentWithType(int64_t); diff --git a/paddle/utils/CommandLineParser.h b/paddle/utils/CommandLineParser.h index d18675ffa30d7f36ee470c35a93e522c68bbfdda..c46567913e253bdda645f129449773040c0ec93d 100644 --- a/paddle/utils/CommandLineParser.h +++ b/paddle/utils/CommandLineParser.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #ifndef PADDLE_USE_GFLAGS #include "DisableCopy.h" @@ -72,7 +71,8 @@ struct CommandLineFlagRegister { * \param [inout] val: The command line argument instance, FLAGS_xxx. * \param [in] desc: The command line helper message. */ - CommandLineFlagRegister(const std::string& name, T* val, + CommandLineFlagRegister(const std::string& name, + T* val, const std::string desc) { CommandLineFlagRegistry::Instance()->commands.push_back( {name, val, desc, *val}); @@ -83,7 +83,8 @@ struct CommandLineFlagRegister { * \brief: Define a command line arguments. * * \param type: The variable type, such as int, double, etc. - * \param name: The variable name. The command line argument is '--name', the variable + * \param name: The variable name. The command line argument is '--name', the + *variable *is 'FLAGS_name' * \param default_value: The default value of command line argument. 
* \param text: The description in command line argument. diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp index 232a478ecd93a7dcb7da7b02a5a1af37a1d1bc43..8740fe662ea21ce93c7c0d9505cdeb75975b3020 100644 --- a/paddle/utils/CustomStackTrace.cpp +++ b/paddle/utils/CustomStackTrace.cpp @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "CustomStackTrace.h" #include "CommandLineParser.h" #include -P_DEFINE_bool(layer_stack_error_only_current_thread, +P_DEFINE_bool( + layer_stack_error_only_current_thread, true, "Dump current thread or whole process layer stack when signal error " "occurred. true means only dump current thread layer stack"); @@ -33,21 +33,23 @@ void installLayerStackTracer() { if (!gLayerStackTrace.empty()) { size_t curTid = -1UL; std::hash hasher; - gLayerStackTrace.dump([&curTid, &hasher](std::thread::id tid, - bool* isForwarding, - const std::string& layerName) { - if (curTid != hasher(tid)) { - if (curTid != -1UL) { - std::cerr << std::endl; - } - curTid = hasher(tid); - std::cerr << "Thread [" << tid << "] "; - if (isForwarding) { - std::cerr << (*isForwarding ? "Forwarding ": "Backwarding "); - } - } - std::cerr << layerName << ", "; - }, FLAGS_layer_stack_error_only_current_thread); + gLayerStackTrace.dump( + [&curTid, &hasher](std::thread::id tid, + bool* isForwarding, + const std::string& layerName) { + if (curTid != hasher(tid)) { + if (curTid != -1UL) { + std::cerr << std::endl; + } + curTid = hasher(tid); + std::cerr << "Thread [" << tid << "] "; + if (isForwarding) { + std::cerr << (*isForwarding ? "Forwarding " : "Backwarding "); + } + } + std::cerr << layerName << ", "; + }, + FLAGS_layer_stack_error_only_current_thread); std::cerr << std::endl; } std::cerr.write(data, sz); diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h index 774c4db2b9be40c38286ef1248bf77746949fd6b..878e14eb5fcf870bf6c29758a1b9a297c13ce730 100644 --- a/paddle/utils/CustomStackTrace.h +++ b/paddle/utils/CustomStackTrace.h @@ -24,13 +24,13 @@ limitations under the License. */ namespace paddle { /** - * A ThreadLocal stack for tracing train/test process. - * (More details of ThreadLocal can be find + * A ThreadLocal stack for tracing train/test process. + * (More details of ThreadLocal can be find * in the comments of ThreadLocal class.) - * + * * For example. * @code{.cpp} - * + * * paddle::CustomStackTrace stack; * for (auto& layer : layers){ * stack.push(layer->getName()); @@ -48,7 +48,7 @@ namespace paddle { * @endcode */ template -class CustomStackTrace{ +class CustomStackTrace { public: /** * @brief Pop out an item from the top of the stack if item == top. @@ -87,7 +87,6 @@ public: return true; } - /** * @brief DumpCallback Type. It will be invoked many times by dump method. * @@ -96,8 +95,8 @@ public: * The third parameter is the item in stack. */ typedef std::function DumpCallback; + bool* /*isPushing*/, + const T& /*item*/)> DumpCallback; /** * Dump all thread stack, and all stack will be cleared. @@ -160,25 +159,23 @@ private: * @brief Get thread local stack reference. */ std::stack& stack() { - return this->getThreadLocal(this->logStack_, - this->stackBuffers_); + return this->getThreadLocal(this->logStack_, this->stackBuffers_); } /** * @brief Get thread local pushing flag. 
*/ bool& pushing() { - return this->getThreadLocal(this->isPushing_, - this->pushingBuffers_); + return this->getThreadLocal(this->isPushing_, this->pushingBuffers_); } private: mutable std::mutex mtx_; - std::unordered_map* > stackBuffers_; - std::unordered_map pushingBuffers_; + std::unordered_map*> stackBuffers_; + std::unordered_map pushingBuffers_; ThreadLocal isPushing_; - ThreadLocal > logStack_; + ThreadLocal> logStack_; }; extern CustomStackTrace gLayerStackTrace; diff --git a/paddle/utils/DisableCopy.h b/paddle/utils/DisableCopy.h index 964daa237beb3085bc78404c6585e6fab16dc27b..e991c07cdf68dac2bdf7fd66de03a292a3bec3c8 100644 --- a/paddle/utils/DisableCopy.h +++ b/paddle/utils/DisableCopy.h @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once /** * Disable copy macro. */ -#define DISABLE_COPY(CLASS_NAME)\ - CLASS_NAME(CLASS_NAME &&) = delete; \ +#define DISABLE_COPY(CLASS_NAME) \ + CLASS_NAME(CLASS_NAME &&) = delete; \ CLASS_NAME(const CLASS_NAME &other) = delete; \ - CLASS_NAME& operator=(const CLASS_NAME &other) = delete + CLASS_NAME &operator=(const CLASS_NAME &other) = delete diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/Excepts.cpp index 9123508fc78d002a9fc5fd0e7e9da8ddec975d6f..b2fad3ac9dd6477e388185d95ebd49c8f0da4c84 100644 --- a/paddle/utils/Excepts.cpp +++ b/paddle/utils/Excepts.cpp @@ -27,28 +27,28 @@ int feenableexcept(unsigned int excepts) { static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; - if ( fegetenv (&fenv) ) return -1; + if (fegetenv(&fenv)) return -1; old_excepts = fenv.__control & FE_ALL_EXCEPT; // unmask fenv.__control &= ~new_excepts; - fenv.__mxcsr &= ~(new_excepts << 7); + fenv.__mxcsr &= ~(new_excepts << 7); - return ( fesetenv (&fenv) ? -1 : old_excepts ); + return (fesetenv(&fenv) ? -1 : old_excepts); } int fedisableexcept(unsigned int excepts) { static fenv_t fenv; unsigned int new_excepts = excepts & FE_ALL_EXCEPT, old_excepts; - if ( fegetenv (&fenv) ) return -1; + if (fegetenv(&fenv)) return -1; old_excepts = fenv.__control & FE_ALL_EXCEPT; // mask fenv.__control |= new_excepts; - fenv.__mxcsr |= new_excepts << 7; + fenv.__mxcsr |= new_excepts << 7; - return ( fesetenv (&fenv) ? -1 : old_excepts ); + return (fesetenv(&fenv) ? -1 : old_excepts); } #endif diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index b2b5a5949e59cb7e65eb0db7573adae8e50f80a8..6fae24e1b58c5296019cfaefe97905c3e8632210 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Flags.h" #ifdef PADDLE_ONLY_CPU @@ -22,7 +21,8 @@ P_DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif P_DEFINE_bool( - parallel_nn, false, + parallel_nn, + false, "Whether to use multi-threads to calculate one neural network." "If it was set false, use gpu_id specify which gpu core to use" "(the device property in the trainer config file will be ingored)." 
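[Reviewer aside, not part of the patch: the reformatted `PrintTypeUsages` in CommandLineParser.cpp above relies on the array-initializer pack-expansion idiom, which is easy to misread once clang-format folds it onto one line. Here is a standalone sketch of the same idiom; `printOne`/`printAll` are hypothetical names written for this note, not Paddle functions.]

```cpp
#include <cstdint>
#include <iostream>
#include <typeinfo>

// Stand-in for Paddle's per-type usage printer; just prints the
// (implementation-defined, possibly mangled) type name.
template <typename T>
void printOne() {
  std::cout << typeid(T).name() << std::endl;
}

// The array-initializer pack-expansion idiom from the reformatted
// PrintTypeUsages: `(printOne<TS>(), 0)` evaluates printOne<TS>() for its
// side effect and yields 0, once per type. Braced-init-lists guarantee
// left-to-right evaluation, so the types are printed in order.
template <typename... TS>
void printAll() {
  int unused[] = {0, (printOne<TS>(), 0)...};
  (void)(unused);  // silence the unused-variable warning, as the diff does
}

int main() {
  printAll<bool, int32_t, double>();  // one line of output per type
  return 0;
}
```

The leading `0` in the initializer keeps the array non-empty even when the pack is empty, which is why the original code can call the function with zero types.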
@@ -32,39 +32,48 @@ P_DEFINE_int32(trainer_count, 1, "Defined how many trainers to train"); P_DEFINE_int32(gpu_id, 0, "Which gpu core to use"); P_DEFINE_int32(port, 20134, "Listening port for pserver"); P_DEFINE_int32(data_server_port, 21134, "Listening port for dserver"); -P_DEFINE_int32(ports_num, 1, +P_DEFINE_int32(ports_num, + 1, "The ports number for parameter send," " increment based on default port number"); -P_DEFINE_int32(ports_num_for_sparse, 0, +P_DEFINE_int32(ports_num_for_sparse, + 0, "The ports number for parameter send," " increment based on default (port + ports_num)"); P_DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers"); P_DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol"); P_DEFINE_int32( - trainer_id, 0, + trainer_id, + 0, "For distributed training, each trainer must be given an unique id" " ranging from 0 to num_trainers-1. Trainer 0 is the master" " trainer"); P_DEFINE_int32(num_gradient_servers, 1, "number of gradient servers"); P_DEFINE_string(comment, "", "A string for commenting this training task"); -P_DEFINE_string(load_missing_parameter_strategy, "fail", +P_DEFINE_string(load_missing_parameter_strategy, + "fail", "which operation to take on load model fails. support " "fail/rand/zero only."); P_DEFINE_int32(log_period, 100, "Log progress every so many batches"); -P_DEFINE_int32(log_period_server, 500, +P_DEFINE_int32(log_period_server, + 500, "Log progress every so many batches at pserver end"); P_DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad"); -P_DEFINE_int32(enable_parallel_vector, 0, +P_DEFINE_int32(enable_parallel_vector, + 0, "threshold for enable parallel vector"); -P_DEFINE_bool(loadsave_parameters_in_pserver, false, +P_DEFINE_bool(loadsave_parameters_in_pserver, + false, "load and save parameters in pserver. " "only work while parameter set sparse_remote_update."); -P_DEFINE_int32(beam_size, 1, +P_DEFINE_int32(beam_size, + 1, "Beam size used in generating most probable output sequences."); P_DEFINE_bool(show_layer_stat, false, "show the statistics of each layer"); P_DEFINE_string(predict_file, "", "File name for saving predict result"); P_DEFINE_bool(prev_batch_state, false, "batch is continue with next batch"); -P_DEFINE_string(init_model_path, "", +P_DEFINE_string(init_model_path, + "", "Path of the initial model parameters." "If it was set, start_pass will be ignored."); diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index b23a29eff9069117a64bfa46d8930a9a43510949..dda60c3f965abd8575677c785b21b058b3400ee5 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include "CommandLineParser.h" diff --git a/paddle/utils/GlobalConstants.cpp b/paddle/utils/GlobalConstants.cpp index 8ed6471e4e85de6d1d012660242e2eae05139ec5..d769cd1ee7d4403f9fddbe91d2afec2c986d6b18 100644 --- a/paddle/utils/GlobalConstants.cpp +++ b/paddle/utils/GlobalConstants.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "GlobalConstants.h" namespace paddle { diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h index 8818b014f80be92f1b7b6907739c3d36bcaa7466..4c74c17a50c8cdbc18a075a58f97efc6b3330deb 100644 --- a/paddle/utils/GlobalConstants.h +++ b/paddle/utils/GlobalConstants.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -20,9 +19,9 @@ namespace paddle { namespace enumeration_wrapper { enum PassType { - PASS_TRAIN, // Train pass - PASS_TEST, // Test pass - PASS_GC, // Gradient Check pass + PASS_TRAIN, // Train pass + PASS_TEST, // Test pass + PASS_GC, // Gradient Check pass PASS_METRIC, // pass for generate template output with no drop rate. // pass for metric learning training with metric learning error, only used // when we are doing KNN evaluation. @@ -81,7 +80,7 @@ enum ParameterType { } // namespace enumeration_wrapper //! explicit import enum into paddle namespace. -using namespace enumeration_wrapper; // NOLINT +using namespace enumeration_wrapper; // NOLINT class TrainAlgorithm { public: diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h index 1fc0363d34597c9447996479aaf771e46d0ba600..5990e1657021611437e8fe730147dfaf207c800d 100644 --- a/paddle/utils/Locks.h +++ b/paddle/utils/Locks.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -26,7 +25,7 @@ namespace paddle { /** * A simple read-write lock. - * The RWlock allows a number of readers or at most one writer + * The RWlock allows a number of readers or at most one writer * at any point in time. * The RWlock disable copy. * @@ -37,7 +36,7 @@ namespace paddle { * * Use lock_shared() to lock on read mode, other thread can get * it by using the same method lock_shared(). - * + * * Unlock: * * Use unlock() to unlock the lock. @@ -68,13 +67,13 @@ protected: }; /** - * The ReadLockGuard is a read mode RWLock - * using RAII management mechanism. + * The ReadLockGuard is a read mode RWLock + * using RAII management mechanism. */ class ReadLockGuard { public: /** - * @brief Construct Function. Lock on rwlock in read mode. + * @brief Construct Function. Lock on rwlock in read mode. */ explicit ReadLockGuard(RWLock& rwlock) : rwlock_(&rwlock) { rwlock_->lock_shared(); @@ -82,7 +81,7 @@ public: /** * @brief Destruct Function. - * @note This method just unlock the read mode rwlock, + * @note This method just unlock the read mode rwlock, * won't destroy the lock. */ ~ReadLockGuard() { rwlock_->unlock(); } @@ -120,16 +119,15 @@ class Semaphore { public: //! Disable copy & assign Semaphore(const Semaphore& other) = delete; - Semaphore& operator= (const Semaphore&& other) = delete; + Semaphore& operator=(const Semaphore&& other) = delete; //! Enable move. - Semaphore(Semaphore&& other): m(std::move(other.m)) { - } + Semaphore(Semaphore&& other) : m(std::move(other.m)) {} public: /** - * @brief Construct Function. - * @param[in] initValue the initial value of the + * @brief Construct Function. + * @param[in] initValue the initial value of the * semaphore, default 0. 
*/ explicit Semaphore(int initValue = 0); @@ -137,22 +135,23 @@ public: ~Semaphore(); /** - * @brief The same as wait(), except if the decrement can not + * @brief The same as wait(), except if the decrement can not * be performed until ts; return false instead of blocking. - * @param[in] ts an absolute timeout in seconds and nanoseconds + * @param[in] ts an absolute timeout in seconds and nanoseconds * since the Epoch 1970-01-01 00:00:00 +0000(UTC). - * @return true if the decrement proceeds before ts, + * @return true if the decrement proceeds before ts, * else return false. */ bool timeWait(struct timespec* ts); /** - * @brief decrement the semaphore. If the semaphore's value is 0, the call blocks. + * @brief decrement the semaphore. If the semaphore's value is 0, the call + * blocks. */ void wait(); /** - * @brief increment the semaphore. If the semaphore's value + * @brief increment the semaphore. If the semaphore's value * is greater than 0, wake up a thread blocked in wait(). */ void post(); @@ -178,9 +177,9 @@ public: ~ThreadBarrier(); /** - * @brief . - * If there were count - 1 threads waiting before, - * then wake up all the count - 1 threads and continue to run together. + * @brief . + * If there were count - 1 threads waiting before, + * then wake up all the count - 1 threads and continue to run together. * Else block the thread until woken by another thread. */ void wait(); @@ -218,12 +217,12 @@ public: /** * @brief wait until pred returns true. - * @tparam Predicate c++ concepts, describes a function object - * that takes a single iterator argument - * that is dereferenced and used to + * @tparam Predicate c++ concepts, describes a function object + * that takes a single iterator argument + * that is dereferenced and used to * return a value testable as a bool. - * @note pred shall not apply any non-constant function - * through the dereferenced iterator. + * @note pred shall not apply any non-constant function + * through the dereferenced iterator. 
*/ template void wait(Predicate pred) { diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp index 9a6b1f2d837e2b04eb668d910538d2f714f43d34..14303bd4c747db2c10ee24b1601f709a79174850 100644 --- a/paddle/utils/Logging.cpp +++ b/paddle/utils/Logging.cpp @@ -91,8 +91,8 @@ static inline int env2index(const char* envName, } static bool gLogToStderr = env2bool("PLOG_LOGTOSTDERR", true); -static const std::vector gLevelName = {"INFO", "WARNING", "ERROR", - "FATAL"}; +static const std::vector gLevelName = { + "INFO", "WARNING", "ERROR", "FATAL"}; static int gMinLogLevel = env2int("PLOG_MINLOGLEVEL", env2index("PLOG_MINLOGLEVEL", gLevelName, 0)); @@ -143,11 +143,19 @@ LogMessage::~LogMessage() { this->generateLogMessage(); } void LogMessage::generateLogMessage() { if (!gLogInited) { - fprintf(stderr, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_, + fprintf(stderr, + "%c %s:%d] %s\n", + "IWEF"[severity_], + fname_, + line_, str().c_str()); } else { for (auto& fd : gLogFds[this->severity_]) { - dprintf(fd, "%c %s:%d] %s\n", "IWEF"[severity_], fname_, line_, + dprintf(fd, + "%c %s:%d] %s\n", + "IWEF"[severity_], + fname_, + line_, str().c_str()); } } @@ -167,9 +175,7 @@ void initializeLogging(int argc, char** argv) { } namespace logging { -void setMinLogLevel(int level) { - paddle::internal::gMinLogLevel = level; -} +void setMinLogLevel(int level) { paddle::internal::gMinLogLevel = level; } void installFailureFunction(void (*callback)() ATTR_NORETURN) { paddle::internal::gFailureFunctionPtr = callback; @@ -191,13 +197,11 @@ void initializeLogging(int argc, char** argv) { } namespace logging { -void setMinLogLevel(int level) { - FLAGS_minloglevel = level; -} +void setMinLogLevel(int level) { FLAGS_minloglevel = level; } void installFailureFunction(void (*callback)()) { google::InstallFailureFunction(callback); } -void installFailureWriter(void(*callback)(const char*, int)) { +void installFailureWriter(void (*callback)(const char*, int)) { google::InstallFailureWriter(callback); } } // namespace logging diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h index 46b6a7feebd83bee7d1ae65801e9c31c9db323c4..e9029b421fa3b68845a54194f4cfa69439a99a0c 100644 --- a/paddle/utils/Logging.h +++ b/paddle/utils/Logging.h @@ -32,11 +32,11 @@ limitations under the License. */ /** * Generate Unique Variable Name, Usefully in macro. - * @SEE http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros + * @SEE + * http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros */ #define UNIQUE_NAME(base) PP_CAT(base, __LINE__) - namespace paddle { //! Log levels. @@ -175,7 +175,7 @@ void installFailureFunction(void (*callback)() ATTR_NORETURN); * @brief installFailureWriter * @note: not implemented currently. */ -inline void installFailureWriter(void(*callback)(const char*, int)) { +inline void installFailureWriter(void (*callback)(const char*, int)) { (void)(callback); // unused callback. 
} } // namespace logging @@ -187,7 +187,7 @@ void initializeLogging(int argc, char** argv); namespace logging { void setMinLogLevel(int level); void installFailureFunction(void (*callback)()); -void installFailureWriter(void(*callback)(const char*, int)); +void installFailureWriter(void (*callback)(const char*, int)); } // namespace logging } #endif // PADDLE_USE_GLOG diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp index 90e5093f96ea4e892b7f2b1f2baa1bf1d6c85c05..7f17a825228ef56be7b8678bf003e57388d4b0bf 100644 --- a/paddle/utils/PythonUtil.cpp +++ b/paddle/utils/PythonUtil.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "PythonUtil.h" #include #include @@ -33,7 +32,8 @@ int executeCMD(const char* cmd, char* result) { strncpy(ps, cmd, kExecuteCMDBufLength); if ((ptr = popen(ps, "r")) != NULL) { size_t count = fread(bufPs, 1, kExecuteCMDBufLength, ptr); - memcpy(result, bufPs, + memcpy(result, + bufPs, count - 1); // why count-1: remove the '\n' at the end result[count] = 0; pclose(ptr); @@ -71,15 +71,14 @@ std::string callPythonFunc(const std::string& moduleName, #else - static std::recursive_mutex g_pyMutex; PyGuard::PyGuard() : guard_(g_pyMutex) {} - -static void printPyErrorStack(std::ostream& os, bool withEndl = false, +static void printPyErrorStack(std::ostream& os, + bool withEndl = false, bool withPyPath = true) { - PyObject * ptype, *pvalue, *ptraceback; + PyObject *ptype, *pvalue, *ptraceback; PyErr_Fetch(&ptype, &pvalue, &ptraceback); PyErr_NormalizeException(&ptype, &pvalue, &ptraceback); PyErr_Clear(); @@ -91,10 +90,8 @@ static void printPyErrorStack(std::ostream& os, bool withEndl = false, } PyTracebackObject* obj = (PyTracebackObject*)ptraceback; - os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) - <<" : " << (pvalue == NULL ? "" - : PyString_AsString( - PyObject_Str(pvalue))); + os << "Python Error: " << PyString_AsString(PyObject_Str(ptype)) << " : " + << (pvalue == NULL ? 
"" : PyString_AsString(PyObject_Str(pvalue))); if (withEndl) { os << std::endl; } @@ -104,8 +101,8 @@ static void printPyErrorStack(std::ostream& os, bool withEndl = false, } while (obj != NULL) { int line = obj->tb_lineno; - const char* filename = PyString_AsString( - obj->tb_frame->f_code->co_filename); + const char* filename = + PyString_AsString(obj->tb_frame->f_code->co_filename); os << " " << filename << " : " << line; if (withEndl) { os << std::endl; @@ -143,7 +140,8 @@ std::string callPythonFunc(const std::string& moduleName, } PyObjectPtr createPythonClass( - const std::string& moduleName, const std::string& className, + const std::string& moduleName, + const std::string& className, const std::vector& args, const std::map& kwargs) { PyGuard guard; @@ -164,21 +162,18 @@ PyObjectPtr createPythonClass( PyObjectPtr kwargsObjectList(PyDict_New()); for (auto& x : kwargs) { PyObjectPtr pyArg(Py_BuildValue("s#", x.second.c_str(), x.second.length())); - PyDict_SetItemString(kwargsObjectList.get(), x.first.c_str(), - pyArg.release()); + PyDict_SetItemString( + kwargsObjectList.get(), x.first.c_str(), pyArg.release()); } - PyObjectPtr pyInstance(PyInstance_New(pyClass.get(), argsObjectList.release(), - kwargsObjectList.release())); + PyObjectPtr pyInstance(PyInstance_New( + pyClass.get(), argsObjectList.release(), kwargsObjectList.release())); CHECK_PY(pyInstance) << "Create class " << className << " failed."; return pyInstance; } - namespace py { -char* repr(PyObject* obj) { - return PyString_AsString(PyObject_Repr(obj)); -} +char* repr(PyObject* obj) { return PyString_AsString(PyObject_Repr(obj)); } std::string getPyCallStack() { std::ostringstream os; @@ -186,7 +181,7 @@ std::string getPyCallStack() { return os.str(); } -PyObjectPtr import(const std::string &moduleName) { +PyObjectPtr import(const std::string& moduleName) { auto module = PyImport_ImportModule(moduleName.c_str()); CHECK_PY(module) << "Import " << moduleName << "Error"; return PyObjectPtr(module); diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h index 00fc177022ac343a5760e57bcbcabf18f697bd4d..65677d90101a0ee2e62c8ac45c50b88326e169e1 100644 --- a/paddle/utils/PythonUtil.h +++ b/paddle/utils/PythonUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #ifndef PADDLE_NO_PYTHON @@ -83,8 +82,7 @@ PyObjectPtr createPythonClass(const std::string& moduleName, const std::vector& args, const std::map& kwargs); -#define CHECK_PY(x)\ - CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() +#define CHECK_PY(x) CHECK((x) != nullptr) << ::paddle::py::getPyCallStack() namespace py { PyObjectPtr import(const std::string& moduleName); @@ -101,13 +99,13 @@ template T castInt(PyObject* obj, bool* ok = nullptr) { if (PyLong_Check(obj)) { if (ok) *ok = true; - return (T) PyLong_AsUnsignedLong(obj); + return (T)PyLong_AsUnsignedLong(obj); } else if (PyInt_Check(obj)) { if (ok) *ok = true; - return (T) PyInt_AsLong(obj); + return (T)PyInt_AsLong(obj); } else { if (ok) *ok = false; - return (T) 0; + return (T)0; } } @@ -116,14 +114,12 @@ T castInt(PyObject* obj, bool* ok = nullptr) { * * Just like toString method in java. */ -char *repr(PyObject* obj); +char* repr(PyObject* obj); /** * Invoke repr of python object. 
*/ -inline char *repr(const PyObjectPtr &obj) { - return repr(obj.get()); -} +inline char* repr(const PyObjectPtr& obj) { return repr(obj.get()); } /** * Get Python Error Stack String. @@ -137,8 +133,7 @@ std::string getPyCallStack(); */ class ObjectHelper { public: - explicit ObjectHelper(const PyObjectPtr& obj): obj_(obj) { - } + explicit ObjectHelper(const PyObjectPtr& obj) : obj_(obj) {} /** * get attribute @@ -211,15 +206,13 @@ public: CHECK(PySequence_Check(seq_)); } - explicit SequenceHelper(PyObject* seq): seq_(seq) { + explicit SequenceHelper(PyObject* seq) : seq_(seq) { CHECK(PySequence_Check(seq_)); } - inline size_t size() const { - return (size_t) PySequence_Size(seq_); - } + inline size_t size() const { return (size_t)PySequence_Size(seq_); } - inline PyObject* operator[] (size_t i) const { + inline PyObject* operator[](size_t i) const { return PySequence_Fast_GET_ITEM(seq_, i); } @@ -260,9 +253,9 @@ private: class DictHelper { public: - explicit DictHelper(PyObject* d): dict_(d) {} + explicit DictHelper(PyObject* d) : dict_(d) {} - explicit DictHelper(const PyObjectPtr& d): dict_(d.get()) {} + explicit DictHelper(const PyObjectPtr& d) : dict_(d.get()) {} void set(const std::string& key, PyObject* item) { PyDict_SetItemString(dict_, key.c_str(), item); @@ -274,17 +267,15 @@ public: void setStringList(const std::string& key, const std::vector& items) { - auto * list = PyList_New(items.size()); - for (size_t i=0; i < items.size(); ++i) { + auto* list = PyList_New(items.size()); + for (size_t i = 0; i < items.size(); ++i) { PyList_SetItem(list, i, PyString_FromString(items[i].c_str())); } this->set(key, list); } private: - inline void checkDict() { - CHECK(PyDict_Check(this->dict_)); - } + inline void checkDict() { CHECK(PyDict_Check(this->dict_)); } PyObject* dict_; }; @@ -298,7 +289,7 @@ inline static bool isCallable(const PyObjectPtr& obj) { */ class CallableHelper { public: - explicit CallableHelper(const PyObjectPtr& obj): obj_(obj) { + explicit CallableHelper(const PyObjectPtr& obj) : obj_(obj) { CHECK(py::isCallable(obj_)); } @@ -308,21 +299,17 @@ public: * reset args, and create new tuple. * @param sz args size. */ - void setArgsSize(size_t sz) { - args.reset(PyTuple_New(sz)); - } + void setArgsSize(size_t sz) { args.reset(PyTuple_New(sz)); } /** * Get args sequence. User can set/get by SequenceHelper. */ - SequenceHelper getArgs() { - return SequenceHelper(args); - } + SequenceHelper getArgs() { return SequenceHelper(args); } /** * Call python method, return an object. */ - PyObject* operator() () { + PyObject* operator()() { PyGuard guard; return PyObject_Call(obj_.get(), args.get(), kwargs.get()); } diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h index f952cf58778dee0565a8e88ef0015d51dc295428..58d17e86c432b90a6b3240dd5528146a24b72184 100644 --- a/paddle/utils/Queue.h +++ b/paddle/utils/Queue.h @@ -142,12 +142,9 @@ public: */ bool waitNotEmptyFor(int seconds) { std::unique_lock lock(queueLock_); - return queueCV_.wait_for( - lock, - std::chrono::seconds(seconds), - [this] { - return numElements_ != 0; - }); + return queueCV_.wait_for(lock, + std::chrono::seconds(seconds), + [this] { return numElements_ != 0; }); } private: @@ -190,7 +187,7 @@ template class BlockingQueue { public: /** - * @brief Construct Function. + * @brief Construct Function. * @param[in] capacity the max number of elements the queue can have. */ explicit BlockingQueue(size_t capacity) : capacity_(capacity) {} @@ -198,9 +195,9 @@ public: /** * @brief enqueue an element into Queue. 
* @param[in] x The element to enqueue, passed by reference. - * @note This method is thread-safe, and will wake up another thread + * @note This method is thread-safe, and will wake up another thread * that was blocked because the queue was empty. - * @note If it's size() >= capacity before enqueue, + * @note If its size() >= capacity before enqueue, * this method will block and wait until size() < capacity. */ void enqueue(const T& x) { @@ -229,7 +226,7 @@ public: /** * Return size of queue. * - * @note This method is thread safe. + * @note This method is thread safe. * The size of the queue won't change until the method returns. */ size_t size() { diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h index 00e5aaec2babfde5cc95b6afad8713e685ffa52a..4051145d9246639fce5d041103c1211a939eddca 100644 --- a/paddle/utils/Stat.h +++ b/paddle/utils/Stat.h @@ -93,7 +93,8 @@ public: return ret.first->second; } - BarrierStatPtr getStat(uint16_t numConnThreads, const std::string& name, + BarrierStatPtr getStat(uint16_t numConnThreads, + const std::string& name, BarrierStatType bType); void deleteStat(const std::string& name); @@ -204,8 +205,10 @@ protected: class TimerOnce { public: - TimerOnce(Stat* stat, const char* info = "", - uint64_t threshold = -1, bool autoStart = true, + TimerOnce(Stat* stat, + const char* info = "", + uint64_t threshold = -1, + bool autoStart = true, uint64_t startStamp = 0) : stat_(stat), info_(info), timer_(autoStart), threshold_(threshold) { if (!autoStart) { @@ -261,21 +264,21 @@ inline StatSet& registerTimerArg2(uint64_t threshold = -1, #define REGISTER_TIMER_SET(statName, start, ...) \ static StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ - TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__), \ - false, start); + TimerOnce __timerOnce( \ + __stat.get(), "", registerTimerArg1(__VA_ARGS__), false, start); // dynamic timer, supports discriminating runtime entities, used in pserver -#define REGISTER_TIMER_DYNAMIC(statName, ...) \ - StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ +#define REGISTER_TIMER_DYNAMIC(statName, ...) \ + StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__)); -#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...) \ - StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ - TimerOnce __timerOnce(__stat.get(), "", registerTimerArg1(__VA_ARGS__), \ - false, start); +#define REGISTER_TIMER_DYNAMIC_SET(statName, start, ...) \ + StatPtr __stat = registerTimerArg2(__VA_ARGS__).getStat(statName); \ + TimerOnce __timerOnce( \ + __stat.get(), "", registerTimerArg1(__VA_ARGS__), false, start); -#define REGISTER_TIMER_INFO(statName, info) \ - static StatPtr __stat = globalStat.getStat(statName); \ +#define REGISTER_TIMER_INFO(statName, info) \ + static StatPtr __stat = globalStat.getStat(statName); \ TimerOnce __timerOnce(__stat.get(), info, 10 * 1000000LU /*threshold*/); #endif // DISABLE_TIMER diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h index 50301a19be46bf608cf072d3f47335abbb830bc9..8b44dad19231781623a0a65d02b24ac1cf9e4523 100644 --- a/paddle/utils/StringUtil.h +++ b/paddle/utils/StringUtil.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once #include @@ -68,8 +67,6 @@ inline T to(const std::string& s) { return v; } - - } // namespace str #undef DEFINE_STRING_CONVERSION diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h index f6c826a1eeb656ff852c70f70b85c3b00a6a5e8b..ade0ee496f94f6165f35dd1a0a37618df8fae585 100644 --- a/paddle/utils/Thread.h +++ b/paddle/utils/Thread.h @@ -57,7 +57,8 @@ public: void join() { thread_->join(); } /** - * @brief Define what to be done on this thread through override this function. + * @brief Define what to be done on this thread by overriding this + * function. */ virtual void run() = 0; @@ -155,10 +156,9 @@ public: /** * @brief Construct Function. No thread will be created. */ - SyncThreadPool() - : jobStartBarrier_(0), - jobFinishBarrier_(0) - { LOG(FATAL) << "Not implemented"; } + SyncThreadPool() : jobStartBarrier_(0), jobFinishBarrier_(0) { + LOG(FATAL) << "Not implemented"; + } /** * @brief Construct Function. Create numWorkers threads in the pool. @@ -191,7 +191,8 @@ public: /** * @brief Execute a job using all the threads in the pool. * @param[in] jobFunc The function to be executed. - * @param[in] ownerFunc Owner thread can do something in owerFunc when job executing. + * @param[in] ownerFunc Owner thread can do something in ownerFunc while the + * job is executing. * @note For the ownerFunc, tid=getNumThreads(). */ void exec(JobFunc jobFunc, JobFunc ownerFunc = nullptr) { @@ -316,7 +317,8 @@ protected: * * Force stop: * - * Use forceStop() to exit forcibly even though there are remaining jobs in the + * Use forceStop() to exit forcibly even though there are remaining jobs in + * the * job queue. */ template @@ -426,7 +428,8 @@ protected: /** * @brief Do the jobs in the job queue sequentially * and enqueue the result into the result queue. - * @note A nullptr will be enqueued into the resulte queue, when a worker finished. + * @note A nullptr will be enqueued into the result queue when a worker + * finishes. */ virtual void run() { while (true) { @@ -492,7 +495,9 @@ public: } ~AsyncThreadPool() { - if (!stopping_) { stop(); } + if (!stopping_) { + stop(); + } } /** @@ -501,7 +506,7 @@ public: void stop() { stopping_ = true; for (size_t i = 0; i < workers_.size(); i++) { - jobs_.enqueue([]{}); + jobs_.enqueue([] {}); } for (auto& worker : workers_) { worker->join(); } @@ -526,7 +531,7 @@ public: * asynchronously. * Call std::future::get() when the execution result is needed; */ - template + template auto addJob(F&& f, Args&&... args) -> std::future::type> { CHECK(!stopping_) << "AsyncThreadPool is closed"; @@ -535,7 +540,7 @@ public: auto task = std::make_shared>( std::bind(std::forward(f), std::forward(args)...)); auto res = task->get_future(); - jobs_.enqueue([task]{ (*task)(); }); + jobs_.enqueue([task] { (*task)(); }); return res; } @@ -551,15 +556,15 @@ public: * * @note *results* may need to be carefully cleared before *addBatchJobs()*. */ - template - void addBatchJobs(const std::vector &jobs, - std::vector::type> &results) { + template + void addBatchJobs(const std::vector& jobs, + std::vector::type>& results) { typedef typename std::result_of::type T; static_assert(!std::is_same::value, - "should pass a non-void function as job"); + "should pass a non-void function as job"); - std::vector > resFuts; - for (const auto &job : jobs) { + std::vector> resFuts; + for (const auto& job : jobs) { resFuts.emplace_back(addJob(job)); } for (auto& fut : resFuts) { @@ -572,13 +577,16 @@ public: * @tparam F doesn't need to have a return value. 
* @param[in] jobs a vector of executable objects. */ - template - void addBatchJobs(const std::vector &jobs) { + template + void addBatchJobs(const std::vector& jobs) { CHECK(!stopping_) << "AsyncThreadPool is closed"; - std::vector > tmpRes; + std::vector> tmpRes; for (const auto& job : jobs) { - tmpRes.emplace_back(addJob([&job]{ job(); return true; })); + tmpRes.emplace_back(addJob([&job] { + job(); + return true; + })); } for (auto& res : tmpRes) { @@ -604,4 +612,4 @@ private: bool stopping_; }; // class AsyncThreadPool -} // namespace paddle +} // namespace paddle diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp index 0f948f1029af85c97d2564a089b7bf878244643c..49d4b1526537def9b8183934faa971402f3678aa 100644 --- a/paddle/utils/ThreadLocal.cpp +++ b/paddle/utils/ThreadLocal.cpp @@ -16,7 +16,8 @@ limitations under the License. */ #include "ThreadLocal.h" #include "CommandLineParser.h" -P_DEFINE_bool(thread_local_rand_use_global_seed, false, +P_DEFINE_bool(thread_local_rand_use_global_seed, + false, "Whether to use global seed in thread local rand."); namespace paddle { diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index b91e4ad5472cab4f48f1eb59304aa7c0cf3f621f..06c8b392af23f81ab48042cb4d24a40b1c50275d 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -91,9 +90,7 @@ public: /** * Implicit conversion to T* */ - operator T*() { - return get(); - } + operator T*() { return get(); } private: static void dataDestructor(void* p) { delete (T*)p; } diff --git a/paddle/utils/TypeDefs.h b/paddle/utils/TypeDefs.h index e02fd62b53823f8bc84b957b4fa62aeb62346c0d..e8be779bea255eec71057495d1253ed92c2256c3 100644 --- a/paddle/utils/TypeDefs.h +++ b/paddle/utils/TypeDefs.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once namespace paddle { diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index b16d4314654ffeab74137ec1ee69203dab56d851..bc727cfa74cdfb51b36259bd08733804578f6d66 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Util.h" #include @@ -54,7 +53,8 @@ P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)"); #include P_DEFINE_int32(profile_signal, 12, "signal for switch google profiler"); -P_DEFINE_string(profile_data_file, "gperf.prof", +P_DEFINE_string(profile_data_file, + "gperf.prof", "file for storing profile data"); static void profilerSwitch(int signalNumber) { @@ -94,18 +94,18 @@ static void installProfilerSwitch() {} namespace paddle { pid_t getTID() { - #if defined(__APPLE__) || defined(__OSX__) - // syscall is deprecated: first deprecated in macOS 10.12. - // syscall is unsupported; - // syscall pid_t tid = syscall(SYS_thread_selfid); - uint64_t tid; - pthread_threadid_np(NULL, &tid); - #else - #ifndef __NR_gettid - #define __NR_gettid 224 - #endif - pid_t tid = syscall(__NR_gettid); - #endif +#if defined(__APPLE__) || defined(__OSX__) + // syscall is deprecated: first deprecated in macOS 10.12. 
+ // syscall is unsupported; + // syscall pid_t tid = syscall(SYS_thread_selfid); + uint64_t tid; + pthread_threadid_np(NULL, &tid); +#else +#ifndef __NR_gettid +#define __NR_gettid 224 +#endif + pid_t tid = syscall(__NR_gettid); +#endif CHECK_NE((int)tid, -1); return tid; } @@ -126,22 +126,25 @@ void registerInitFunction(std::function func, int priority) { } void runInitFunctions() { - std::call_once(g_onceFlag, []() { - LOG(INFO) << "Calling runInitFunctions"; - if (g_initFuncs) { - std::sort(g_initFuncs->begin(), g_initFuncs->end(), - [](const PriorityFuncPair& x, const PriorityFuncPair& y) { - return x.first > y.first; - }); - for (auto& f : *g_initFuncs) { - f.second(); - } - delete g_initFuncs; - g_initFuncs = nullptr; - } - g_initialized = true; - LOG(INFO) << "Call runInitFunctions done."; - }); + std::call_once( + g_onceFlag, + []() { + LOG(INFO) << "Calling runInitFunctions"; + if (g_initFuncs) { + std::sort(g_initFuncs->begin(), + g_initFuncs->end(), + [](const PriorityFuncPair& x, const PriorityFuncPair& y) { + return x.first > y.first; + }); + for (auto& f : *g_initFuncs) { + f.second(); + } + delete g_initFuncs; + g_initFuncs = nullptr; + } + g_initialized = true; + LOG(INFO) << "Call runInitFunctions done."; + }); } void initMain(int argc, char** argv) { @@ -282,7 +285,7 @@ void mkDir(const char* filename) { } } -void mkDirRecursively(const char *dir) { +void mkDirRecursively(const char* dir) { struct stat sb; if (!stat(dir, &sb)) return; @@ -303,7 +306,6 @@ void loadFileList(const std::string& fileListFileName, } } - double getMemoryUsage() { FILE* fp = fopen("/proc/meminfo", "r"); CHECK(fp) << "failed to fopen /proc/meminfo"; @@ -363,7 +365,9 @@ size_t calculateServiceNum(const std::string& pservers, int ports_num) { return hosts.size() * ports_num; } -void memcpyWithCheck(void* dest, const void* src, size_t num, +void memcpyWithCheck(void* dest, + const void* src, + size_t num, const void* srcEnd) { int minus = (char*)srcEnd - (char*)src - num; CHECK_LE(0, minus) << "memcpyWithCheck: copy " << num diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h index 2adb626c83f94c7c5d7a8d53653a46090e19e7b7..ed38f8fa60b3716c12e755b047557c1409fa767c 100644 --- a/paddle/utils/Util.h +++ b/paddle/utils/Util.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include @@ -47,7 +46,8 @@ limitations under the License. */ */ #define FOR_EACH(iterator_name, container) \ for (auto iterator_name = (container).begin(), e = (container).end(); \ - iterator_name != e; ++iterator_name) + iterator_name != e; \ + ++iterator_name) /** * Loop over the elements in a container in reverse order @@ -60,8 +60,8 @@ limitations under the License. */ */ #define FOR_EACH_R(iterator_name, container) \ for (auto iterator_name = (container).rbegin(), e = (container).rend(); \ - iterator_name != e; ++iterator_name) - + iterator_name != e; \ + ++iterator_name) namespace paddle { @@ -77,11 +77,11 @@ pid_t getTID(); * \f] */ inline constexpr size_t findLastSet(size_t x) { - return std::is_same::value ? - (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) - : (std::is_same::value ? // NOLINT - (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) - : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); + return std::is_same::value + ? (x ? 8 * sizeof(x) - __builtin_clz(x) : 0) + : (std::is_same::value // NOLINT + ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) + : (x ? 
8 * sizeof(x) - __builtin_clzll(x) : 0)); } /** @@ -95,7 +95,6 @@ inline int mod(int a, int b) { return r >= 0 ? r : r + b; } - /** * find the value given a key k from container c. * If the key can be found, the value is stored in *value @@ -120,7 +119,7 @@ static bool contains(const Container& container, const T& val) { /** * pop and get the front element of a container */ -template +template typename Container::value_type pop_get_front(Container& c) { typename Container::value_type v; swap(v, c.front()); @@ -207,7 +206,6 @@ protected: int devId_; }; - /** * Enables direct access to memory allocations on a peer device(d2). * input: @@ -250,7 +248,6 @@ private: bool syncFlag_; }; - inline bool useGpu(int deviceId) { return FLAGS_parallel_nn ? (deviceId >= 0 ? true : false) : FLAGS_use_gpu; } @@ -328,7 +325,9 @@ T readT(char*& p, const char* pEnd) { return v; } -void memcpyWithCheck(void* dest, const void* src, size_t num, +void memcpyWithCheck(void* dest, + const void* src, + size_t num, const void* srcEnd); /** @@ -338,7 +337,6 @@ void memcpyWithCheck(void* dest, const void* src, size_t num, class SyncThreadPool; SyncThreadPool* getGlobalSyncThreadPool(); - namespace path { // directory separator @@ -363,7 +361,8 @@ std::string dirname(const std::string& path); std::string join(const std::string& part1, const std::string& part2); template -std::string join(const std::string& part1, const std::string& part2, +std::string join(const std::string& part1, + const std::string& part2, Args... args) { return join(join(part1, part2), args...); } @@ -392,8 +391,8 @@ public: std::call_once(onceFlag_, [&] { invokeThreadId_ = curThreadId; }); CHECK_EQ(invokeThreadId_, curThreadId) << "This method should invoke in " - "same thread, but first invoked in " << invokeThreadId_ - << " current invoked in " << curThreadId; + "same thread, but first invoked in " + << invokeThreadId_ << " current invoked in " << curThreadId; } private: @@ -447,28 +446,23 @@ private: * @brief The ScopedCallbacks class is a callback invoker when object is * created and destroyed. */ -template +template class ScopedCallbacks { public: - ScopedCallbacks(CallbackType enter, - CallbackType exit, - Args& ... args) - : exit_(std::bind(exit, args...)) { + ScopedCallbacks(CallbackType enter, CallbackType exit, Args&... args) + : exit_(std::bind(exit, args...)) { enter(args...); } ScopedCallbacks(const ScopedCallbacks& other) = delete; - ScopedCallbacks& operator = (const ScopedCallbacks& other) = delete; + ScopedCallbacks& operator=(const ScopedCallbacks& other) = delete; - ~ScopedCallbacks() { - exit_(); - } + ~ScopedCallbacks() { exit_(); } private: std::function exit_; }; - /** * std compatible allocator with memory alignment. * @tparam T type of allocator elements. 
@@ -537,8 +531,7 @@ public: return nullptr; } if (n > max_size()) { - throw std::length_error( - "AlignAllocator::allocate() - Int Overflow."); + throw std::length_error("AlignAllocator::allocate() - Int Overflow."); } void* r = nullptr; CHECK_EQ(posix_memalign(&r, Alignment * 8, sizeof(T) * n), 0); @@ -558,7 +551,6 @@ private: AlignedAllocator& operator=(const AlignedAllocator&); // disable }; - class Deprecated { public: explicit Deprecated(const std::string& msg = "") { diff --git a/paddle/utils/Version.cpp b/paddle/utils/Version.cpp index b59b78f5707bd4a7ee9f8073927f55c0c9ef0398..e706983918b4a865f6674a34083ef0143bd6e185 100644 --- a/paddle/utils/Version.cpp +++ b/paddle/utils/Version.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #include "Version.h" #include "Flags.h" @@ -34,18 +33,22 @@ void printVersion(std::ostream& os) { #ifndef PADDLE_VERSION #define PADDLE_VERSION "unknown" #endif - os << "paddle version: " << PADDLE_VERSION << std::endl << std::boolalpha - << "\t" << "withGpu: " << version::isWithGpu() << std::endl - << "\t" << "withAvx: " << version::isWithAvx() << std::endl - << "\t" << "withPyDataProvider: " << version::isWithPyDataProvider() - << std::endl - << "\t" << "withTimer: " << version::isWithTimer() << std::endl - << "\t" << "withFpga: " << version::isWithFpga() << std::endl - << "\t" << "real byte size: "<< version::sizeofReal() << std::endl - << std::endl; + os << "paddle version: " << PADDLE_VERSION << std::endl + << std::boolalpha << "\t" + << "withGpu: " << version::isWithGpu() << std::endl + << "\t" + << "withAvx: " << version::isWithAvx() << std::endl + << "\t" + << "withPyDataProvider: " << version::isWithPyDataProvider() << std::endl + << "\t" + << "withTimer: " << version::isWithTimer() << std::endl + << "\t" + << "withFpga: " << version::isWithFpga() << std::endl + << "\t" + << "real byte size: " << version::sizeofReal() << std::endl + << std::endl; } - void printVersion() { if (FLAGS_version) { printVersion(std::cout); diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h index e6655fa75dabfeec99bc2157b8c9a1e9e4f19263..e6c799644ee7f88e4e90eec565d1bab2bc9faed7 100644 --- a/paddle/utils/Version.h +++ b/paddle/utils/Version.h @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once #include #include "TypeDefs.h" @@ -35,7 +34,6 @@ namespace paddle { * real byte size: 4 */ - namespace version { /** @@ -44,7 +42,6 @@ namespace version { */ void printVersion(); - void printVersion(std::ostream& os); /** * @brief isWithGpu @@ -75,7 +72,6 @@ constexpr bool isWithPyDataProvider() { #endif } - /** * @brief isWithTimer * @return true if paddle compiled with timer. @@ -116,25 +112,19 @@ constexpr bool isWithFpga() { * @brief sizeofReal * @return return the byte size of real */ -constexpr size_t sizeofReal() { - return sizeof(real); -} +constexpr size_t sizeofReal() { return sizeof(real); } /** * @brief isPaddleUseDouble * @return true if paddle compiled with double precision. 
*/ -constexpr bool isPaddleUseDouble() { - return sizeofReal() == sizeof(double); -} +constexpr bool isPaddleUseDouble() { return sizeofReal() == sizeof(double); } /** * @brief isPaddleUseFloat * @return true if paddle compiled with float precision */ -constexpr bool isPaddleUseFloat() { - return sizeofReal() == sizeof(float); -} +constexpr bool isPaddleUseFloat() { return sizeofReal() == sizeof(float); } } // namespace version diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp index 347ae64c26dfdfcdaff62886481c20e9c4c7cfec..93016daeaea644ca44499fdc6024ec8deac57ca8 100644 --- a/paddle/utils/arch/linux/Locks.cpp +++ b/paddle/utils/arch/linux/Locks.cpp @@ -22,26 +22,19 @@ public: sem_t sem; }; -Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { +Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { sem_init(&m->sem, 0, initValue); } -Semaphore::~Semaphore() { - sem_destroy(&m->sem); -} +Semaphore::~Semaphore() { sem_destroy(&m->sem); } bool Semaphore::timeWait(struct timespec* ts) { return (0 == sem_timedwait(&m->sem, ts)); } -void Semaphore::wait() { - sem_wait(&m->sem); -} - -void Semaphore::post() { - sem_post(&m->sem); -} +void Semaphore::wait() { sem_wait(&m->sem); } +void Semaphore::post() { sem_post(&m->sem); } class SpinLockPrivate { public: @@ -51,25 +44,20 @@ public: char padding_[64 - sizeof(pthread_spinlock_t)]; }; -SpinLock::SpinLock():m(new SpinLockPrivate()) {} - +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} SpinLock::~SpinLock() { delete m; } -void SpinLock::lock() { - pthread_spin_lock(&m->lock_); -} +void SpinLock::lock() { pthread_spin_lock(&m->lock_); } -void SpinLock::unlock() { - pthread_spin_unlock(&m->lock_); -} +void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); } class ThreadBarrierPrivate { public: pthread_barrier_t barrier_; }; -ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate()) { +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) { pthread_barrier_init(&m->barrier_, nullptr, count); } @@ -78,8 +66,6 @@ ThreadBarrier::~ThreadBarrier() { delete m; } -void ThreadBarrier::wait() { - pthread_barrier_wait(&m->barrier_); -} +void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); } } // namespace paddle diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp index b3ec454976520be10995bd8399b7ce838e3fa824..ae563a6afd29b6315d9c6609474faddbfaaded14 100644 --- a/paddle/utils/arch/osx/Locks.cpp +++ b/paddle/utils/arch/osx/Locks.cpp @@ -22,20 +22,16 @@ namespace paddle { class SemaphorePrivate { public: - ~SemaphorePrivate() { - dispatch_release(sem); - } + ~SemaphorePrivate() { dispatch_release(sem); } dispatch_semaphore_t sem; }; -Semaphore::Semaphore(int initValue): m(new SemaphorePrivate()) { +Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) { m->sem = dispatch_semaphore_create(initValue); } -Semaphore::~Semaphore() { - delete m; -} +Semaphore::~Semaphore() { delete m; } bool Semaphore::timeWait(timespec *ts) { dispatch_time_t tm = dispatch_walltime(ts, 0); @@ -46,9 +42,7 @@ void Semaphore::wait() { dispatch_semaphore_wait(m->sem, DISPATCH_TIME_FOREVER); } -void Semaphore::post() { - dispatch_semaphore_signal(m->sem); -} +void Semaphore::post() { dispatch_semaphore_signal(m->sem); } class SpinLockPrivate { public: @@ -56,17 +50,15 @@ public: char padding_[64 - sizeof(lock_)]; // Padding to cache line size }; -SpinLock::SpinLock(): m(new SpinLockPrivate()) {} +SpinLock::SpinLock() : m(new SpinLockPrivate()) {} 
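// Editor's aside (illustrative, not part of this patch): a minimal usage
// sketch of the Semaphore and SpinLock wrappers whose Linux and macOS
// implementations are reformatted above. It assumes only the public
// interface declared in paddle/utils/Locks.h; the function and variable
// names below are hypothetical.
//
//   #include <thread>
//   #include "paddle/utils/Locks.h"
//
//   void locksUsageSketch() {
//     paddle::SpinLock lock;
//     paddle::Semaphore done(0);  // initial value 0, so wait() blocks
//     int counter = 0;
//     std::thread worker([&] {
//       lock.lock();              // short critical section: spinning is cheap
//       ++counter;
//       lock.unlock();
//       done.post();              // wakes the thread blocked in done.wait()
//     });
//     done.wait();                // blocks until the worker calls post()
//     worker.join();
//   }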
SpinLock::~SpinLock() { delete m; } void SpinLock::lock() { - while (m->lock_.test_and_set(std::memory_order_acquire)) {} -} - -void SpinLock::unlock() { - m->lock_.clear(std::memory_order_release); + while (m->lock_.test_and_set(std::memory_order_acquire)) { + } } +void SpinLock::unlock() { m->lock_.clear(std::memory_order_release); } class ThreadBarrierPrivate { public: @@ -75,7 +67,7 @@ public: int count_; int tripCount_; - inline explicit ThreadBarrierPrivate(int cnt):count_(0), tripCount_(cnt) { + inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) { CHECK_NE(cnt, 0); CHECK_GE(pthread_mutex_init(&mutex_, 0), 0); CHECK_GE(pthread_cond_init(&cond_, 0), 0); @@ -106,7 +98,7 @@ public: } }; -ThreadBarrier::ThreadBarrier(int count): m(new ThreadBarrierPrivate(count)) {} +ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {} ThreadBarrier::~ThreadBarrier() { delete m; } void ThreadBarrier::wait() { m->wait(); } diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp index 9bb6827540f61e8c6cc8b64c2b04ed4d0fcebab1..5ecfb2b4f511e63eac21a5eae3829532f6860d66 100644 --- a/paddle/utils/tests/test_CommandLineParser.cpp +++ b/paddle/utils/tests/test_CommandLineParser.cpp @@ -63,10 +63,15 @@ TEST(CommandLineParser, defaultValue) { } TEST(CommandLineParser, normal) { - char* argv[] = { - cc("test_program"), cc("--i2=32"), cc("--str1=abc"), - cc("--b2=1"), cc("-b1=False"), cc("--d2=.34"), - cc("--d1=0"), cc("--l1=-12345678901234"), cc("-ul2=3212")}; + char* argv[] = {cc("test_program"), + cc("--i2=32"), + cc("--str1=abc"), + cc("--b2=1"), + cc("-b1=False"), + cc("--d2=.34"), + cc("--d1=0"), + cc("--l1=-12345678901234"), + cc("-ul2=3212")}; int argc = sizeof(argv) / sizeof(char*); paddle::ParseCommandLineFlags(&argc, argv); ASSERT_EQ(argc, 1); @@ -104,8 +109,6 @@ int main(int argc, char** argv) { #else -int main(int argc, char** argv) { - return 0; -} +int main(int argc, char** argv) { return 0; } #endif diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index 3e665021471cb3c179b13960dcc9f2284a0d664c..3bfb381ed93659feebcc567a04b2a095dc94dfa8 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -22,11 +22,12 @@ limitations under the License. 
*/ P_DEFINE_int32(test_thread_num, 10, "testing thread number"); -void testNormalImpl(const std::function&, - size_t, size_t, - paddle::ThreadBarrier&, - paddle::ThreadBarrier&)>& callback) { +void testNormalImpl( + const std::function&, + size_t, + size_t, + paddle::ThreadBarrier&, + paddle::ThreadBarrier&)>& callback) { paddle::CustomStackTrace tracer; paddle::ThreadBarrier doneBarrier(FLAGS_test_thread_num + 1); paddle::ThreadBarrier startBarrier(FLAGS_test_thread_num + 1); @@ -35,10 +36,13 @@ void testNormalImpl(const std::function> threads; threads.reserve(FLAGS_test_thread_num); - for (int32_t i=0; i < FLAGS_test_thread_num; ++i) { - threads.emplace_back(new std::thread([&tracer, &countDown, &layerSize, - &startBarrier, &doneBarrier, - &callback]{ + for (int32_t i = 0; i < FLAGS_test_thread_num; ++i) { + threads.emplace_back(new std::thread([&tracer, + &countDown, + &layerSize, + &startBarrier, + &doneBarrier, + &callback] { callback(tracer, countDown, layerSize, startBarrier, doneBarrier); })); } @@ -55,18 +59,19 @@ void testNormalImpl(const std::function& tracer, - size_t countDown, size_t layerSize, - paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){ + size_t countDown, + size_t layerSize, + paddle::ThreadBarrier& start, + paddle::ThreadBarrier& finish) { while (countDown-- > 0) { start.wait(); - for (size_t i=0; i < layerSize; ++i) { + for (size_t i = 0; i < layerSize; ++i) { tracer.push("layer_" + std::to_string(i)); } tracer.pop(""); - for (size_t i=0; i < layerSize; ++i) { + for (size_t i = 0; i < layerSize; ++i) { tracer.pop("layer_" + std::to_string(layerSize - 1 - i)); } finish.wait(); @@ -75,12 +80,14 @@ TEST(CustomStackTrace, normalTrain) { } TEST(CustomStackTrace, normalTest) { - testNormalImpl([] (paddle::CustomStackTrace& tracer, - size_t countDown, size_t layerSize, - paddle::ThreadBarrier& start, paddle::ThreadBarrier& finish){ + testNormalImpl([](paddle::CustomStackTrace& tracer, + size_t countDown, + size_t layerSize, + paddle::ThreadBarrier& start, + paddle::ThreadBarrier& finish) { while (countDown-- > 0) { start.wait(); - for (size_t i=0; i < layerSize; ++i) { + for (size_t i = 0; i < layerSize; ++i) { tracer.push("layer_" + std::to_string(i)); } tracer.clear(); // in forward test, tracer will clear after forward. diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp index c19c98614e6a7d6285990aa19849131579f7307b..d39a190961a96906eef2b510cb3538c639d5df5c 100644 --- a/paddle/utils/tests/test_CustomStackTracePrint.cpp +++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp @@ -18,7 +18,7 @@ limitations under the License. 
*/ int main(int argc, char** argv) { paddle::initMain(argc, argv); - for (size_t i=0; i < 1000; ++i) { + for (size_t i = 0; i < 1000; ++i) { paddle::gLayerStackTrace.push("layer_" + std::to_string(i)); if (i == 998) { throw "Unhandle exception"; diff --git a/paddle/utils/tests/test_Logging.cpp b/paddle/utils/tests/test_Logging.cpp index a9382de6da4ef5b425afa4a8d76652d7506d8e72..9f477fab14a2abde93505a05fc4c9ccd3d6426b6 100644 --- a/paddle/utils/tests/test_Logging.cpp +++ b/paddle/utils/tests/test_Logging.cpp @@ -54,7 +54,7 @@ TEST(Logging, Check) { auto pcheckDown = [&] { P_CHECK(a == b); }; ASSERT_DEATH(pcheckDown(), - "F .*test_Logging.cpp:[0-9]+] Check failed: a == b "); + "F .*test_Logging.cpp:[0-9]+] Check failed: a == b "); P_CHECK_LE(a, b); P_CHECK_LT(a, b); @@ -157,8 +157,6 @@ int main(int argc, char** argv) { #else -int main(int, char**) { - return 0; -} +int main(int, char**) { return 0; } #endif diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp index ebc84e0f52d823bf4799d08ff8ea6a036e131f66..77d281962cfeaa3cc951a72eddf4f37b619c5691 100644 --- a/paddle/utils/tests/test_SpinLock.cpp +++ b/paddle/utils/tests/test_SpinLock.cpp @@ -21,17 +21,18 @@ limitations under the License. */ P_DEFINE_int32(test_thread_num, 100, "testing thread number"); -void testNormalImpl(size_t thread_num, const std::function - & callback) { +void testNormalImpl( + size_t thread_num, + const std::function& callback) { paddle::SpinLock mutex; std::vector threads; threads.reserve(thread_num); size_t count = 0; for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back([&thread_num, &count, &mutex, &callback]{ - callback(thread_num, count, mutex); - }); + threads.emplace_back([&thread_num, &count, &mutex, &callback] { + callback(thread_num, count, mutex); + }); } for (auto& thread : threads) { thread.join(); @@ -41,12 +42,13 @@ void testNormalImpl(size_t thread_num, const std::function } TEST(ThreadSpinLock, normalTest) { - for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) { - testNormalImpl(thread_num, [](size_t thread_num, - size_t& count, paddle::SpinLock& mutex){ - std::lock_guard lock(mutex); - ++count; - }); + for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { + testNormalImpl( + thread_num, + [](size_t thread_num, size_t& count, paddle::SpinLock& mutex) { + std::lock_guard lock(mutex); + ++count; + }); } } diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp index b8636709e9b42c7baa5d0106492ab6c0782ed6d4..2c699b791ffad8ed680c5537005aac7dad832f41 100644 --- a/paddle/utils/tests/test_StringUtils.cpp +++ b/paddle/utils/tests/test_StringUtils.cpp @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #include "paddle/utils/StringUtil.h" #include diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp index bf4e2753458e08a0b20a33663d8b8175919852b0..154db5d9c616d4817b933c82587834f5ce2d0f8e 100644 --- a/paddle/utils/tests/test_Thread.cpp +++ b/paddle/utils/tests/test_Thread.cpp @@ -20,7 +20,7 @@ using paddle::AsyncThreadPool; // NOLINT TEST(AsyncThreadPool, addJob) { AsyncThreadPool pool(8); - auto a = pool.addJob([]{ return 1; }); + auto a = pool.addJob([] { return 1; }); auto b = pool.addJob([] { return true; }); auto c = pool.addJob([] { return false; }); @@ -36,10 +36,7 @@ TEST(AsyncThreadPool, addBatchJob) { std::vector jobs; for (int i = 0; i < 10000; i++) { - jobs.emplace_back( - [&] { - counter++; - }); + jobs.emplace_back([&] { counter++; }); } pool.addBatchJobs(jobs); @@ -55,13 +52,16 @@ TEST(AsyncThreadPool, multiThreadAddBatchJob) { int counter = 0; const int numMonitors = 300; const int numSlaves = 300; - std::vector moniterJobs(numMonitors, [&] { - std::vector slaveJobs(numSlaves, - [mut, &counter] { - std::lock_guard lk(*mut); - counter++; - }); - levelTwoPool.addBatchJobs(slaveJobs); + std::vector moniterJobs( + numMonitors, + [&] { + std::vector slaveJobs( + numSlaves, + [mut, &counter] { + std::lock_guard lk(*mut); + counter++; + }); + levelTwoPool.addBatchJobs(slaveJobs); }); levelOnePool.addBatchJobs(moniterJobs); ASSERT_EQ(counter, numMonitors * numSlaves); @@ -70,13 +70,10 @@ TEST(AsyncThreadPool, multiThreadAddBatchJob) { TEST(AsyncThreadPool, addBatchJobWithResults) { AsyncThreadPool pool(100); - std::vector > jobs; + std::vector> jobs; const int numJobs = 100; for (int i = 0; i < numJobs; i++) { - jobs.emplace_back( - [i]{ - return i; - }); + jobs.emplace_back([i] { return i; }); } std::vector res; diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp index 90bd6c21bc8e5ac05b248a0517f9e4fb43d04054..20b9babd94cf4e6a475daece349c871bd606d83d 100644 --- a/paddle/utils/tests/test_ThreadBarrier.cpp +++ b/paddle/utils/tests/test_ThreadBarrier.cpp @@ -22,42 +22,44 @@ limitations under the License. 
*/ P_DEFINE_int32(test_thread_num, 100, "testing thread number"); -void testNormalImpl(size_t thread_num, - const std::function&, - paddle::ThreadBarrier&)>& callback) { - std::mutex mutex; - std::set tids; - paddle::ThreadBarrier barrier(thread_num); +void testNormalImpl( + size_t thread_num, + const std::function&, + paddle::ThreadBarrier&)>& callback) { + std::mutex mutex; + std::set tids; + paddle::ThreadBarrier barrier(thread_num); - std::vector threads; - threads.reserve(thread_num); - for (size_t i = 0; i < thread_num; ++i) { - threads.emplace_back([&thread_num, &mutex, - &tids, &barrier, &callback]{ - callback(thread_num, mutex, tids, barrier); + std::vector threads; + threads.reserve(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + threads.emplace_back([&thread_num, &mutex, &tids, &barrier, &callback] { + callback(thread_num, mutex, tids, barrier); }); - } + } - for (auto& thread : threads) { - thread.join(); - } + for (auto& thread : threads) { + thread.join(); + } } TEST(ThreadBarrier, normalTest) { - for (auto &thread_num : {10, 30, 50 , 100 , 300, 1000}) { + for (auto& thread_num : {10, 30, 50, 100, 300, 1000}) { testNormalImpl(thread_num, - [](size_t thread_num, std::mutex& mutex, - std::set& tids, - paddle::ThreadBarrier& barrier){ - { - std::lock_guard guard(mutex); - tids.insert(std::this_thread::get_id()); - } - barrier.wait(); - // Check whether all threads reach this point or not - CHECK_EQ(tids.size(), thread_num); - }); + [](size_t thread_num, + std::mutex& mutex, + std::set& tids, + paddle::ThreadBarrier& barrier) { + { + std::lock_guard guard(mutex); + tids.insert(std::this_thread::get_id()); + } + barrier.wait(); + // Check whether all threads reach this point or not + CHECK_EQ(tids.size(), thread_num); + }); } }
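A closing note for reviewers: the reformatted tests above exercise AsyncThreadPool and ThreadBarrier together. Below is a minimal sketch of that usage pattern, assuming only the public interfaces in paddle/utils/Thread.h, paddle/utils/Locks.h, and paddle/utils/Logging.h; it is a hypothetical example, not part of this patch.

#include "paddle/utils/Locks.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Thread.h"

int main() {
  paddle::AsyncThreadPool pool(4);  // four worker threads
  // addJob() returns a std::future; get() blocks until the job has run.
  auto fut = pool.addJob([] { return 40 + 2; });
  CHECK_EQ(fut.get(), 42);

  // A barrier of size 2: the main thread and one pool worker rendezvous.
  paddle::ThreadBarrier barrier(2);
  auto sync = pool.addJob([&] {
    barrier.wait();  // the worker blocks here until main also waits
    return true;
  });
  barrier.wait();  // both sides have now reached the barrier
  sync.get();
  return 0;
}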