From 6eab5638f03f49ab1ff3d3a4fc30d870f42a6153 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Tue, 22 Aug 2017 13:28:51 +0800 Subject: [PATCH] Fix remote large update core (#3518) * fix remote large update core * wip * working version * fix style check * fix style check * update style check --- .../gserver/gradientmachines/NeuralNetwork.cpp | 2 +- paddle/parameter/Parameter.h | 5 ++++- paddle/pserver/ParameterClient2.cpp | 16 ++++++++++++++-- paddle/pserver/ParameterClient2.h | 1 + 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index cfa80a8936..26cff3e677 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -202,7 +202,7 @@ void NeuralNetwork::prefetch(const std::vector& inArgs) { auto mat = dynamic_cast( para->getMat(PARAMETER_VALUE).get()); para->clearGradient(); - mat->clearIndices(); + if (mat) mat->clearIndices(); } } } diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h index e31cbc3dee..321f4275d8 100644 --- a/paddle/parameter/Parameter.h +++ b/paddle/parameter/Parameter.h @@ -65,7 +65,10 @@ public: size_t getSize() const { return config_.size(); } bool isFullSize() const { - return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); + if (bufs_[PARAMETER_VALUE]) { + return this->getSize() == bufs_[PARAMETER_VALUE]->getSize(); + } + return false; } inline bool useGpu() const { return useGpu_; } diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp index f7e391f763..54063a809a 100644 --- a/paddle/pserver/ParameterClient2.cpp +++ b/paddle/pserver/ParameterClient2.cpp @@ -65,7 +65,6 @@ void ParameterClient2::initThreads() { LOG(INFO) << "parallel_thread_num dosent need to set"; } syncThreadPool_.reset(new SyncThreadPool(threadNum_)); - startThreads(); } @@ -224,6 +223,14 @@ void ParameterClient2::prepareSendData( request.set_cost(cost); request.set_batch_status(batchStatus); CHECK_EQ(request.blocks_size(), 0); + VLOG(10) << "request: trainer_id: " << request.trainer_id() + << " update_mode" << request.update_mode() + << " send_back_parameter: " << request.send_back_parameter() + << " send_back_parameter_type: " + << request.send_back_parameter_type() + << " num_samples: " << request.num_samples() + << " cost: " << request.cost() + << " batch_status: " << request.batch_status(); } for (const auto& segments : parameterSegments) { const auto it = parameterMap_.find(segments.id); @@ -251,11 +258,17 @@ void ParameterClient2::prepareSendData( CHECK(sendMat != nullptr) << "sendMat is nullptr"; syncThreadPool_->exec([&](int tid, size_t numThreads) { + std::lock_guard guard(sparseAutoGrowthMutex_); const auto& localIndices = prefetchMat->getLocalIndices(); /// num of sparse rows size_t nLocalBlocks = localIndices.size(); uint64_t beginDim = 0; uint64_t endDim = 0; + + // FIXME(typhoonzero): let it resize first + prefetchMat->getLocalRow(nLocalBlocks + 1); + sendMat->getLocalRow(nLocalBlocks + 1); + for (size_t row = 0; row < nLocalBlocks; ++row) { int64_t blockId = localIndices[row]; // local row -> sparse row int serverId = std::abs((blockId + nameHash) % serviceNum_); @@ -275,7 +288,6 @@ void ParameterClient2::prepareSendData( block->set_begin_pos(row * blockSize); /// block len block->set_block_size(endDim - beginDim); - if (sendingPara) { sendJob->parallelInputIovs[serverId].push_back( {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize}); diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h index 89b3ddd502..29b9eeacdd 100644 --- a/paddle/pserver/ParameterClient2.h +++ b/paddle/pserver/ParameterClient2.h @@ -583,6 +583,7 @@ protected: #ifndef PADDLE_DISABLE_TIMER uint64_t forwardbackwordTime_; #endif + std::mutex sparseAutoGrowthMutex_; /// map id to parameter used for decoding protobuf data std::unordered_map parameterMap_; -- GitLab