diff --git a/doc/fluid/design/concepts/lod_tensor.md b/doc/fluid/design/concepts/lod_tensor.md index a88292e7888d0ebc64ee89ca315dfea38a12c71d..d606d7a790b4b0dc18553f2220d39cec8aa619ec 100644 --- a/doc/fluid/design/concepts/lod_tensor.md +++ b/doc/fluid/design/concepts/lod_tensor.md @@ -155,7 +155,7 @@ into offsets 3 2+3 4+5 1+9 2+10 3+12 ``` -so we know that the first sentence is from word 0 to word 3, and the second sentence from work 3 to word 5. +so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5. Similarly, the lengths in the top level LoD diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 21197d587b772aa046d6b3ce4394d3057ed6bf35..4755559f8d0c5b5fdeb6b56a28fff8a32ea7f82f 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -37,20 +37,26 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale) + platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale, + bool balance_parameter_opt_between_cards) : loss_var_name_(loss_var_name), places_(places), local_scopes_(local_scopes), - nccl_ctxs_(nccl_ctxs) { + nccl_ctxs_(nccl_ctxs), + balance_parameter_opt_between_cards_( + balance_parameter_opt_between_cards) { #else MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, - const std::vector &local_scopes, bool use_default_grad_scale) + const std::vector &local_scopes, bool use_default_grad_scale, + bool balance_parameter_opt_between_cards) : loss_var_name_(loss_var_name), places_(places), - local_scopes_(local_scopes) { + local_scopes_(local_scopes), + balance_parameter_opt_between_cards_( + balance_parameter_opt_between_cards) { #endif for (auto &p : params) { grad_names_.insert(GradVarName(p)); @@ -124,6 +130,12 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( // Find "send" op first for split is in front of send. OpDesc *send_op = GetSendOpDesc(program); + size_t cur_device_id = 0; + std::vector> var_name_on_devices; + std::vector> bcast_var_name_set; + var_name_on_devices.resize(places_.size()); + bcast_var_name_set.resize(places_.size()); + bool is_forwarding = true; for (auto *op : program.Block(0).AllOps()) { if (op->Type() == "send") { @@ -139,17 +151,33 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } is_forwarding = false; } else { - CreateComputationalOps(&result, *op, places_.size()); + int op_dev_id = GetOpDeviceID(var_name_on_devices, *op); + if (op_dev_id == -1) { // var on all device + CreateComputationalOps(&result, *op, places_.size()); + } else { + CreateComputationalOp(&result, *op, op_dev_id); + for (auto &var_name : op->OutputArgumentNames()) { + var_name_on_devices[op_dev_id].emplace(var_name); + } + } if (!is_forwarding && places_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. 
for (auto &og : op->OutputArgumentNames()) { if (IsParameterGradientOnce(og, &og_has_been_broadcast)) { - if (IsSparseGradient(var_types, og)) { - CreateReduceOp(&result, og, 0); - CreateBroadcastOp(&result, og, 0); + if (balance_parameter_opt_between_cards_) { + CreateReduceOp(&result, og, cur_device_id); + var_name_on_devices[cur_device_id].emplace(og); + bcast_var_name_set[cur_device_id].emplace( + og.substr(0, og.size() - strlen(kGradVarSuffix))); + cur_device_id = (cur_device_id + 1) % places_.size(); } else { - InsertNCCLAllReduceOp(&result, og); + if (IsSparseGradient(var_types, og)) { + CreateReduceOp(&result, og, 0); + CreateBroadcastOp(&result, og, 0); + } else { + InsertNCCLAllReduceOp(&result, og); + } } } } @@ -157,6 +185,13 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } } + // Insert BCast Ops + for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(&result, bcast_name, dev_id); + } + } /* Dependency graph has been constructed. However, there are still data harzaeds need to be handled. @@ -265,6 +300,26 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } +int MultiDevSSAGraphBuilder::GetOpDeviceID( + const std::vector> &var_name_on_devices, + const OpDesc &op) const { + if (!balance_parameter_opt_between_cards_) { + return -1; + } + + int var_dev_id = -1; + for (auto &var_name : op.InputArgumentNames()) { + if (var_dev_id != -1) break; + for (size_t i = 0; i < var_name_on_devices.size(); ++i) { + if (var_name_on_devices[i].count(var_name)) { + var_dev_id = static_cast(i); + break; + } + } + } + return var_dev_id; +} + void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 674e2779a112c26d05b84cd54df2c826e9a63373..3a3e9e3b8538f52962e6a5ccd1a177e58d6c2f6b 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -36,13 +36,15 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::unordered_set ¶ms, const std::vector &local_scopes, platform::NCCLContextMap *nccl_ctxs, - bool use_default_grad_scale); + bool use_default_grad_scale, + bool balance_parameter_opt_between_cards); #else MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, - bool use_default_grad_scale); + bool use_default_grad_scale, + bool balance_parameter_opt_between_cards); #endif std::unique_ptr Build(const ProgramDesc &program) const override; @@ -60,6 +62,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { #ifdef PADDLE_WITH_CUDA platform::NCCLContextMap *nccl_ctxs_; #endif + bool balance_parameter_opt_between_cards_; bool use_default_grad_scale_; bool IsScaleLossOp(const OpDesc &op) const; @@ -84,6 +87,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; + int GetOpDeviceID( + const std::vector> &var_name_on_devices, + const OpDesc &op) const; + void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, diff --git 
a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4d62edfef4b996235819d63147f2809dd50f2cfb..95e807c0afa45bc4f4feb84d450b2d0584bc3b28 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -58,7 +58,8 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set &bcast_vars, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, const std::vector &local_scopes, bool allow_op_delay, - bool use_default_grad_scale, size_t num_trainers, size_t trainer_id) + bool use_default_grad_scale, bool balance_parameter_opt_between_cards, + size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; @@ -99,11 +100,12 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA details::MultiDevSSAGraphBuilder builder( member_->places_, loss_var_name, params, member_->local_scopes_, - member_->nccl_ctxs_.get(), use_default_grad_scale); + member_->nccl_ctxs_.get(), use_default_grad_scale, + balance_parameter_opt_between_cards); #else - details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, - params, member_->local_scopes_, - use_default_grad_scale); + details::MultiDevSSAGraphBuilder builder( + member_->places_, loss_var_name, params, member_->local_scopes_, + use_default_grad_scale, balance_parameter_opt_between_cards); #endif auto graph = builder.Build(main_program); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 306d2bdfafe6399e8d1a5cf09feed126447a44aa..7ab17de3896792194ac6d6ca6800ee1741d01ae1 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -41,6 +41,7 @@ class ParallelExecutor { const std::string& loss_var_name, Scope* scope, const std::vector& local_scopes, bool allow_op_delay, bool use_default_grad_scale, + bool balance_parameter_opt_between_cards, size_t num_trainers = 0, size_t trainer_id = 0); ~ParallelExecutor(); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6223688ada6a448bca1c90d8800d603f7a8cbcb9..d38a9ce58726a1d045d6905354b0b592166c0110 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -276,6 +276,11 @@ foreach(src ${READER_LIBRARY}) set(OP_LIBRARY ${src} ${OP_LIBRARY}) endforeach() +add_subdirectory(detection) +foreach(src ${DETECTION_LIBRARY}) + set(OP_LIBRARY ${src} ${OP_LIBRARY}) +endforeach() + set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") cc_test(gather_test SRCS gather_test.cc DEPS tensor) diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5bb58c2f4047a3bf2f8592b605772b4fa166c57 --- /dev/null +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -0,0 +1,29 @@ +set(LOCAL_DETECTION_LIBS) + +function(detection_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(options "") + set(common_deps op_registry) + set(pybind_flag 0) + cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + op_library(${TARGET_NAME} SRCS ${detection_library_SRCS} DEPS ${common_deps} ${detection_library_DEPS}) + set(LOCAL_DETECTION_LIBS + ${TARGET_NAME} + ${LOCAL_DETECTION_LIBS} + PARENT_SCOPE) +endfunction() + +detection_library(bipartite_match_op SRCS 
bipartite_match_op.cc) +detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) +detection_library(iou_similarity_op SRCS iou_similarity_op.cc +iou_similarity_op.cu) +detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) +detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc) +detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +detection_library(target_assign_op SRCS target_assign_op.cc +target_assign_op.cu) + +# Export local libraries to parent +set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc similarity index 100% rename from paddle/fluid/operators/bipartite_match_op.cc rename to paddle/fluid/operators/detection/bipartite_match_op.cc diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc similarity index 99% rename from paddle/fluid/operators/box_coder_op.cc rename to paddle/fluid/operators/detection/box_coder_op.cc index ce9bf10dc3c4718c5adfc0d5034d06c22f9d54e1..74848005d0bea6e5c818ff999727aa2b8ad51d84 100644 --- a/paddle/fluid/operators/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/box_coder_op.h" +#include "paddle/fluid/operators/detection/box_coder_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu similarity index 99% rename from paddle/fluid/operators/box_coder_op.cu rename to paddle/fluid/operators/detection/box_coder_op.cu index 708c7a5fa96c2f9ce6a2d913ca5f30126bb6192f..8cef8e03439df4ca5b0fa94176a21a36f9eb9f70 100644 --- a/paddle/fluid/operators/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/box_coder_op.h" +#include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { diff --git a/paddle/fluid/operators/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h similarity index 100% rename from paddle/fluid/operators/box_coder_op.h rename to paddle/fluid/operators/detection/box_coder_op.h diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc similarity index 98% rename from paddle/fluid/operators/iou_similarity_op.cc rename to paddle/fluid/operators/detection/iou_similarity_op.cc index 007e0af7a5a9fd8c562218e641e1867388b081f1..8e58605fcea04f9ffa97ce8cca53c073e7068aaf 100644 --- a/paddle/fluid/operators/iou_similarity_op.cc +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/iou_similarity_op.h" +#include "paddle/fluid/operators/detection/iou_similarity_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu similarity index 92% rename from paddle/fluid/operators/iou_similarity_op.cu rename to paddle/fluid/operators/detection/iou_similarity_op.cu index f40a388d62e66a110656ebb71094d46b5ac147eb..8342b4138c87e6ea1803146bac6d6954a569ef5f 100644 --- a/paddle/fluid/operators/iou_similarity_op.cu +++ b/paddle/fluid/operators/detection/iou_similarity_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/iou_similarity_op.h" +#include "paddle/fluid/operators/detection/iou_similarity_op.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( diff --git a/paddle/fluid/operators/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h similarity index 100% rename from paddle/fluid/operators/iou_similarity_op.h rename to paddle/fluid/operators/detection/iou_similarity_op.h diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc similarity index 100% rename from paddle/fluid/operators/mine_hard_examples_op.cc rename to paddle/fluid/operators/detection/mine_hard_examples_op.cc diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc similarity index 100% rename from paddle/fluid/operators/multiclass_nms_op.cc rename to paddle/fluid/operators/detection/multiclass_nms_op.cc diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc similarity index 99% rename from paddle/fluid/operators/prior_box_op.cc rename to paddle/fluid/operators/detection/prior_box_op.cc index a0b069da0dda59c769723211533df8c33511fe3f..4e35c38e4e03d4d0f00601812fdc4803519b89ae 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/prior_box_op.h" +#include "paddle/fluid/operators/detection/prior_box_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu similarity index 99% rename from paddle/fluid/operators/prior_box_op.cu rename to paddle/fluid/operators/detection/prior_box_op.cu index 0ea8909296f8f52d252b0ec258666cf32d69a8bb..f67e6ca91c0852b5a3be35d23246884d1157caa4 100644 --- a/paddle/fluid/operators/prior_box_op.cu +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/prior_box_op.h" +#include "paddle/fluid/operators/detection/prior_box_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h similarity index 100% rename from paddle/fluid/operators/prior_box_op.h rename to paddle/fluid/operators/detection/prior_box_op.h diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc similarity index 98% rename from paddle/fluid/operators/target_assign_op.cc rename to paddle/fluid/operators/detection/target_assign_op.cc index 9fce216e880f2af3b665202f4c4d2777995db003..367001939251114a9cf442fd85c734958ccb2da8 100644 --- a/paddle/fluid/operators/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/target_assign_op.h" +#include "paddle/fluid/operators/detection/target_assign_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu similarity index 97% rename from paddle/fluid/operators/target_assign_op.cu rename to paddle/fluid/operators/detection/target_assign_op.cu index 24664f99b20f92108220d27ec58e8fdf3ba6193c..ddf6889942355457fb281b6c33430ab8337db3ed 100644 --- a/paddle/fluid/operators/target_assign_op.cu +++ b/paddle/fluid/operators/detection/target_assign_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/target_assign_op.h" +#include "paddle/fluid/operators/detection/target_assign_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h similarity index 100% rename from paddle/fluid/operators/target_assign_op.h rename to paddle/fluid/operators/detection/target_assign_op.h diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 6d50bd13e8a7cb440aa049ab8e14cc0aa38b9000..b4ff6b75988b4d61d3e4c17e6871b8e01ec87d1c 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -83,8 +83,8 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_service_->SetProgram(&empty_program); rpc_service_->SetExecutor(&executor); - server_thread_.reset(new std::thread( - std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, rpc_service_))); + std::thread server_thread( + std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, rpc_service_)); rpc_service_->SetCond(0); VLOG(3) << "start getting nccl id from trainer 0..."; auto recv = rpc_service_->Get(); @@ -92,13 +92,12 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_service_->ShutDown(); VLOG(3) << "rpc server stopped"; // TODO(wuyi): reinit nccl communicators - server_thread_->join(); + server_thread.join(); delete rpc_service_; } protected: mutable detail::AsyncGRPCServer* rpc_service_ = nullptr; - mutable std::shared_ptr server_thread_; }; class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index c76fc17d78cce514b5e35ce8e5ca890d7cec1e98..d84c88cb3bc1a13acb83b3444dbd1bfca3cba503 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -96,10 +96,22 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - template - static void GEMM_BATCH(ARGS... 
args) { + static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, int lda, + long long int strideA, const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(args...)); + PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( + handle, transa, transb, m, n, k, + reinterpret_cast(alpha), + reinterpret_cast(A), lda, strideA, + reinterpret_cast(B), ldb, strideB, + reinterpret_cast(beta), reinterpret_cast<__half *>(C), + ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 577cbe3beb806ffcb2f1a7d7a469402be9b69224..14b3624b420cb883b36268c0a5a9e8692dbb5b43 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -172,9 +172,9 @@ void Blas::BatchedGEMM( c_array.data(), &ldc, 1 /* group_count */, &batchCount); #else for (int k = 0; k < batchCount; ++k) { - const float *Ak = &A[k * strideA]; - const float *Bk = &B[k * strideB]; - float *Ck = &C[k * M * N]; + auto *Ak = &A[k * strideA]; + auto *Bk = &B[k * strideB]; + auto *Ck = &C[k * M * N]; this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); } #endif diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index b5bf84e5178c143de35ec6dcb16b1bde5577c166..d5af718723e8d44da0971ea7756b8c36e771cca2 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -33,9 +33,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 2d05449822e6addc42c4fea5af8422ae6dcfd37d..7182149164854038bb67a9f06cdbec8a4a0f1fb2 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -25,7 +25,7 @@ namespace operators { * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the * original x_dim is returned. */ -static framework::DDim RowMatrixFromVector(const framework::DDim& x_dim) { +static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { if (x_dim.size() > 1) { return x_dim; } @@ -36,7 +36,7 @@ static framework::DDim RowMatrixFromVector(const framework::DDim& x_dim) { * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the * original y_dim is returned. 
*/ -static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) { +static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { if (y_dim.size() > 1) { return y_dim; } @@ -46,12 +46,12 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) { template class MatMulKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto& x = + void Compute(const framework::ExecutionContext &context) const override { + auto &x = detail::Ref(context.Input("X"), "Cannot find X"); - auto& y = + auto &y = detail::Ref(context.Input("Y"), "Cannot find Y"); - auto* out = context.Output("Out"); + auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); auto blas = math::GetBlas(context); @@ -65,7 +65,7 @@ class MatMulKernel : public framework::OpKernel { // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. -static framework::Tensor FoldInitDims(const framework::Tensor& input) { +static framework::Tensor FoldInitDims(const framework::Tensor &input) { auto output = input; auto in_dims = input.dims(); if (in_dims.size() == 3) { @@ -78,8 +78,8 @@ static framework::Tensor FoldInitDims(const framework::Tensor& input) { // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. template -static framework::Tensor FoldHeadAndLastDims(const DeviceContext& context, - const framework::Tensor& input) { +static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, + const framework::Tensor &input) { auto in_dims = input.dims(); if (in_dims.size() != 3) { return input; @@ -102,7 +102,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext& context, * If transposed, `H,W` will be swapped. */ static void ReshapeTensorIntoMatrixSequence( - framework::Tensor* x, const math::MatDescriptor& descriptor) { + framework::Tensor *x, const math::MatDescriptor &descriptor) { int64_t h, w; h = descriptor.height_; w = descriptor.width_; @@ -130,9 +130,9 @@ static void ReshapeTensorIntoMatrixSequence( * If any of `X` and `Y` has batch size BatchSize, the out will have the * BatchSize. 
*/ -static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, - framework::Tensor* y, - framework::Tensor* out, bool trans_x, +static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, + framework::Tensor *y, + framework::Tensor *out, bool trans_x, bool trans_y) { auto x_dim = RowMatrixFromVector(x->dims()); auto y_dim = ColumnMatrixFromVector(y->dims()); @@ -177,10 +177,10 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, template class MatMulGradKernel : public framework::OpKernel { public: - void MatMul(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, - framework::Tensor* out) const { + void MatMul(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + const framework::Tensor &b, bool trans_b, + framework::Tensor *out) const { out->mutable_data(context.GetPlace()); auto blas = math::GetBlas(context); auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); @@ -188,18 +188,18 @@ class MatMulGradKernel : public framework::OpKernel { blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0)); } - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, bool trans_a, - bool is_fold_init_dims_a, const framework::Tensor& b, + void CalcInputGrad(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor &b, bool trans_b, bool is_fold_init_dims_b, - framework::Tensor* out) const { + framework::Tensor *out) const { if (out == nullptr) return; bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && out->dims().size() == 2; if (!need_combine) { MatMul(context, a, trans_a, b, trans_b, out); } else { - auto& ctx = context.template device_context(); + auto &ctx = context.template device_context(); MatMul(context, is_fold_init_dims_a ? 
FoldInitDims(a) : FoldHeadAndLastDims(ctx, a), @@ -210,13 +210,13 @@ class MatMulGradKernel : public framework::OpKernel { } } - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { auto x = *context.Input("X"); auto y = *context.Input("Y"); auto dout = *context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Output(framework::GradVarName("Y")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *dy = context.Output(framework::GradVarName("Y")); bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); @@ -269,7 +269,7 @@ class MatMulOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* context) const override { + void InferShape(framework::InferShapeContext *context) const override { PADDLE_ENFORCE(context->HasInput("X"), "Input(X) of MatMulOp should not be null."); PADDLE_ENFORCE(context->HasInput("Y"), @@ -375,7 +375,7 @@ class MatMulOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* context) const override { + void InferShape(framework::InferShapeContext *context) const override { PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null"); PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), @@ -401,7 +401,7 @@ class MatMulOpGradMaker : public framework::SingleGradOpDescMaker { protected: std::unique_ptr Apply() const override { - auto* retv = new framework::OpDesc(); + auto *retv = new framework::OpDesc(); retv->SetType("matmul_grad"); retv->SetInput("X", Input("X")); retv->SetInput("Y", Input("Y")); @@ -420,15 +420,27 @@ REGISTER_OPERATOR(matmul, ops::MatMulOp, ops::MatMulOpMaker, ops::MatMulOpGradMaker); REGISTER_OPERATOR(matmul_grad, ops::MatMulOpGrad); REGISTER_OP_CPU_KERNEL( - matmul, ops::MatMulKernel); + matmul, ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); REGISTER_OP_CPU_KERNEL( matmul_grad, - ops::MatMulGradKernel); + ops::MatMulGradKernel, + ops::MatMulGradKernel, + ops::MatMulGradKernel); #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( - matmul, ops::MatMulKernel); + matmul, ops::MatMulKernel, + ops::MatMulKernel, + ops::MatMulKernel); REGISTER_OP_CUDA_KERNEL( matmul_grad, - ops::MatMulGradKernel); + ops::MatMulGradKernel, + ops::MatMulGradKernel, + ops::MatMulGradKernel); #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 827c2701ba151b971654131b87fb1eff6d4df8be..b62291a99d34457dd17bf2bcafc1fc611419f086 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -503,12 +503,13 @@ All parameter, weight, gradient are variables in Paddle. 
const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope, std::vector &local_scopes, bool allow_op_delay, bool use_default_grad_scale, - size_t num_trainers, size_t trainer_id) { + bool balance_parameter_opt_between_cards, size_t num_trainers, + size_t trainer_id) { new (&self) ParallelExecutor( num_threads, use_event, places, params, bcast_vars, main_program, loss_var_name, scope, local_scopes, - allow_op_delay, use_default_grad_scale, num_trainers, - trainer_id); + allow_op_delay, use_default_grad_scale, + balance_parameter_opt_between_cards, num_trainers, trainer_id); }) .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs) // NOTE: even we return a vec* to Python use reference policy. diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index ac02401c79b787716b2e5f43e0d1c5686cf2bd13..0051b698471b40bffc12921f86dcde642714e07d 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -16,6 +16,7 @@ from __future__ import print_function import core import numpy import six.moves as six +import multiprocessing from framework import Variable, default_main_program @@ -116,3 +117,60 @@ class DataFeeder(object): for each_name, each_converter in six.zip(self.feed_names, converter): ret_dict[each_name] = each_converter.done() return ret_dict + + def feed_parallel(self, iterable, num_places=None): + if isinstance(self.place, core.CUDAPlace): + places = [ + core.CUDAPlace(i) + for i in six.xrange(self._get_number_of_places_(num_places)) + ] + else: + places = [ + core.CPUPlace() + for _ in six.xrange(self._get_number_of_places_(num_places)) + ] + + if len(iterable) != len(places): + raise ValueError("feed_parallel takes multiple mini-batches. Each " + "mini-batch will be feed on each device. The " + "number of devices and number of mini-batches " + "must be same.") + + place = self.place + for p, batch in six.zip(places, iterable): + self.place = p + yield self.feed(batch) + self.place = place + + def _get_number_of_places_(self, num_places): + if num_places is not None: + return int(num_places) + elif isinstance(self.place, core.CUDAPlace): + return core.get_cuda_device_count() + else: + return multiprocessing.cpu_count() + + def decorate_reader(self, + reader, + multi_devices, + num_places=None, + drop_last=True): + def __reader_creator__(): + if not multi_devices: + for item in reader(): + yield self.feed(item) + else: + num = self._get_number_of_places_(num_places) + item = [] + for batch in reader(): + item.append(batch) + if len(item) == num: + yield list(self.feed_parallel(item, num)) + item = [] + if not drop_last and len(item) != 0: + raise ValueError( + "The data batch which cannot fit for devices will be " + "dropped is not implementation. Other strategies are " + "not implemented") + + return __reader_creator__ diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index b38526bc574a103ece86aecdecf06b0bcfd6cad0..1b8b9c07622dce80245c69484bb6abf737ffca83 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -16,31 +16,42 @@ import core import framework import executor import io +from trainer import check_and_get_place + __all__ = ['Inferencer', ] class Inferencer(object): - def __init__(self, network_func, param_path=None, place=None): - # 1. we need to generate a framework.Program by calling - # network_func. Reference: fluid.program_guard in test_word2vec.py - - # 2. move the default_main_program to self.program. - - # 3. 
run the default_startup program. - - # 4. load params from param_path into scope + def __init__(self, param_path, place=None): + """ + :param param_path: the path where the inference model is saved by fluid.io.save_inference_model + :param place: place to do the inference + """ + self.param_path = param_path self.scope = core.Scope() - self.place = place - self.startup_program = framework.Program() - # TODO: generate the startup_program with network_func - - exe = executor.Executor(place) - exe.run(self.startup_program, scope=self.scope) - if param_path: + self.exe = executor.Executor(check_and_get_place(place)) + with executor.scope_guard(self.scope): # load params from param_path into scope - io.load_persistables(exe, dirname=param_path) - - def infer(self, inputs): - # run self.program - pass + [self.inference_program, _, + self.fetch_targets] = io.load_inference_model( + executor=self.exe, dirname=param_path) + + def infer(self, inputs, return_numpy=True): + """ + :param inputs: a map of {"input_name": input_var} that will be feed into the inference program + to get the predict value + :param return_numpy: if return numpy value for row tensor + :return: the predict value of the inference model + """ + if not isinstance(inputs, dict): + raise ValueError( + "inputs should be a map of {'input_name': input_var}") + + with executor.scope_guard(self.scope): + results = self.exe.run(self.inference_program, + feed=inputs, + fetch_list=self.fetch_targets, + return_numpy=return_numpy) + + return results diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 08b8a878b6490bc989620085f3f9c06c7032d882..54506e97ed5c9a23f5a1e9624391f466c1c498d6 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -263,6 +263,9 @@ def get_inference_program(target_vars, main_program=None): def prepend_feed_ops(inference_program, feed_target_names, feed_holder_name='feed'): + if len(feed_target_names) == 0: + return + global_block = inference_program.global_block() feed_var = global_block.create_var( name=feed_holder_name, @@ -323,9 +326,10 @@ def save_inference_model(dirname, if isinstance(feeded_var_names, basestring): feeded_var_names = [feeded_var_names] else: - if not (bool(feeded_var_names) and all( - isinstance(name, basestring) for name in feeded_var_names)): - raise ValueError("'feed_var_names' should be a list of str.") + if len(feeded_var_names) > 0: + if not (bool(feeded_var_names) and all( + isinstance(name, basestring) for name in feeded_var_names)): + raise ValueError("'feed_var_names' should be a list of str.") if isinstance(target_vars, Variable): target_vars = [target_vars] diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 4f6db7c2be23686f949002722d52fcb69cbe3cea..59294d972116a624bed60b0c766df1e00f02acdc 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -31,6 +31,7 @@ class ParallelExecutor(object): allow_op_delay=False, share_vars_from=None, use_default_grad_scale=True, + balance_parameter_opt_between_cards=False, num_trainers=0, trainer_id=0): """ @@ -53,6 +54,9 @@ class ParallelExecutor(object): gradients of each device and scaled gradients would be aggregated. Otherwise, a customized scale value should be fed to the network. + balance_parameter_opt_between_cards(bool, default True): Whether + updating different gradients on different cards. Currently, it + is not recommended. 
num_trainers(int, default 0): If greater than 0, NCCL will be initialized with multpile rank of nodes, each node should have same number of GPUs. Distributed training will be enabled then. @@ -137,6 +141,7 @@ class ParallelExecutor(object): local_scopes, allow_op_delay, use_default_grad_scale, + balance_parameter_opt_between_cards, num_trainers, trainer_id) self.scope = scope diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index 673c965b662a022739f8d489c331f4de9455a926..ee734f3c782adb5196a03aca5718377009a5b4e7 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -5,3 +5,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() + +add_subdirectory(high-level-api) diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ab00325a2eef3bbc79757ad1a3e6f8511c49552 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -0,0 +1,9 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() + +add_subdirectory(recognize_digits) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..673c965b662a022739f8d489c331f4de9455a926 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/notest_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py similarity index 57% rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/notest_recognize_digits_conv.py rename to python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index a8282c71f895718930ea14a1e0bff777441c4c57..1f91f471f22f7c8fafc80672e660ddccf6f7cc4a 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/notest_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -21,7 +21,6 @@ import unittest import math import sys import os -import paddle.v2.dataset as dataset BATCH_SIZE = 64 @@ -54,47 +53,65 @@ def train_program(): predict = inference_program() cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = fluid.layers.mean(cost) - acc = fluid.layers.accuracy(input=predict, label=label) - return avg_cost, acc + # acc = fluid.layers.accuracy(input=predict, label=label) + # return avg_cost, acc + return avg_cost def train(use_cuda, save_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - trainer = fluid.Trainer(train_program, place=place, optimizer=optimizer) + + trainer = fluid.Trainer( + train_func=train_program, + 
infer_func=inference_program, + place=place, + optimizer=optimizer) def event_handler(event): - if isinstance(event, fluid.EndIteration): - avg_cost, acc = event.values - print("avg_cost: %s" % avg_cost) - print("acc : %s" % acc) - - if (event.batch_id + 1) % 10 == 0: - test_metrics = trainer.test(reader=dataset.mnist.test()) - avg_cost_set = test_metrics[0] - acc_set = test_metrics[1] - - # get test acc and loss - acc = numpy.array(acc_set).mean() - avg_cost = numpy.array(avg_cost_set).mean() - if float(acc) > 0.2: # Smaller value to increase CI speed - trainer.save_params(save_dirname) - else: - print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( - event.batch_id + 1, float(avg_cost), float(acc))) - if math.isnan(float(avg_cost)): - sys.exit("got NaN loss, training failed.") + if isinstance(event, fluid.EndEpochEvent): + # if (event.epoch + 1) % 10 == 0: + # trainer.save_params(save_dirname) + trainer.save_inference_model(save_dirname) + + # TODO: Uncomment this part once we are sure that .train is working + # test_reader = paddle.batch( + # paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + # test_metrics = trainer.test(reader=test_reader) + # avg_cost_set = test_metrics[0] + # acc_set = test_metrics[1] + # + # # get test acc and loss + # acc = numpy.array(acc_set).mean() + # avg_cost = numpy.array(avg_cost_set).mean() + # + # print("avg_cost: %s" % avg_cost) + # print("acc : %s" % acc) + # + # if float(acc) > 0.2: # Smaller value to increase CI speed + # trainer.save_params(save_dirname) + # else: + # print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + # event.epoch + 1, float(avg_cost), float(acc))) + # if math.isnan(float(avg_cost)): + # sys.exit("got NaN loss, training failed.") + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) trainer.train( - reader=dataset.mnist.train(), num_pass=100, event_handler=event_handler) + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=['img', 'label']) def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( - inference_program, param_path=save_dirname, place=place) + inferencer = fluid.Inferencer(param_path=save_dirname, place=place) batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, @@ -114,5 +131,5 @@ def main(use_cuda): if __name__ == '__main__': - for use_cuda in (False, True): - main(use_cuda=use_cuda) + # for use_cuda in (False, True): + main(use_cuda=False) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/notest_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py similarity index 54% rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/notest_recognize_digits_mlp.py rename to python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index 3efa931d5886e565d2d876a31309883ee1660389..f072d70abdba5077bf0ac2ff6ff972e24c8226f5 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/notest_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -21,7 +21,6 @@ import unittest import math import sys import os -import paddle.v2.dataset as dataset BATCH_SIZE = 64 @@ -41,47 +40,64 @@ def train_program(): predict = inference_program() cost = fluid.layers.cross_entropy(input=predict, label=label) avg_cost = 
fluid.layers.mean(cost) - acc = fluid.layers.accuracy(input=predict, label=label) - return avg_cost, acc + # acc = fluid.layers.accuracy(input=predict, label=label) + # return avg_cost, acc + return avg_cost def train(use_cuda, save_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - optimizer = fluid.optimizer.Adam(learning_rate=0.001) - trainer = fluid.Trainer(train_program, place=place, optimizer=optimizer) + + trainer = fluid.Trainer( + train_func=train_program, + infer_func=inference_program, + place=place, + optimizer=optimizer) def event_handler(event): - if isinstance(event, fluid.EndIteration): - avg_cost, acc = event.values - print("avg_cost: %s" % avg_cost) - print("acc : %s" % acc) - - if (event.batch_id + 1) % 10 == 0: - test_metrics = trainer.test(reader=dataset.mnist.test()) - avg_cost_set = test_metrics[0] - acc_set = test_metrics[1] - - # get test acc and loss - acc = numpy.array(acc_set).mean() - avg_cost = numpy.array(avg_cost_set).mean() - if float(acc) > 0.2: # Smaller value to increase CI speed - trainer.save_params(save_dirname) - else: - print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( - event.batch_id + 1, float(avg_cost), float(acc))) - if math.isnan(float(avg_cost)): - sys.exit("got NaN loss, training failed.") + if isinstance(event, fluid.EndEpochEvent): + # if (event.epoch + 1) % 10 == 0: + trainer.save_inference_model(save_dirname) + + # TODO: Uncomment this part once we are sure that .train is working + # test_reader = paddle.batch( + # paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + # test_metrics = trainer.test(reader=test_reader) + # avg_cost_set = test_metrics[0] + # acc_set = test_metrics[1] + # + # # get test acc and loss + # acc = numpy.array(acc_set).mean() + # avg_cost = numpy.array(avg_cost_set).mean() + # + # print("avg_cost: %s" % avg_cost) + # print("acc : %s" % acc) + # + # if float(acc) > 0.2: # Smaller value to increase CI speed + # trainer.save_params(save_dirname) + # else: + # print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format( + # event.epoch + 1, float(avg_cost), float(acc))) + # if math.isnan(float(avg_cost)): + # sys.exit("got NaN loss, training failed.") + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) trainer.train( - reader=dataset.mnist.train(), num_pass=100, event_handler=event_handler) + num_epochs=1, + event_handler=event_handler, + reader=train_reader, + feed_order=['img', 'label']) def infer(use_cuda, save_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( - inference_program, param_path=save_dirname, place=place) + inferencer = fluid.Inferencer(param_path=save_dirname, place=place) batch_size = 1 tensor_img = numpy.random.uniform(-1.0, 1.0, @@ -101,5 +117,5 @@ def main(use_cuda): if __name__ == '__main__': - for use_cuda in (False, True): - main(use_cuda=use_cuda) + # for use_cuda in (False, True): + main(use_cuda=False) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py index 93f7757a66ea9b217c7831c7263936ece0aa8f18..00ba4acf88b1b543c256a48709d7f514ebbe1e8c 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py @@ -99,45 +99,45 @@ def train(use_cuda, is_sparse, save_path): place = 
fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - # print type(event) if isinstance(event, fluid.EndEpochEvent): outs = trainer.test(reader=test_reader) avg_cost = outs[0] print("loss= ", avg_cost) if avg_cost < 5.0: - trainer.save_params(save_path) + trainer.save_inference_model(save_path) return if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") trainer = fluid.Trainer( partial(train_program, is_sparse), + partial(inference_program, is_sparse), fluid.optimizer.SGD(learning_rate=0.001), place=place) trainer.train( - reader=train_reader, num_epochs=100, event_handler=event_handler) + reader=train_reader, num_epochs=1, event_handler=event_handler) def infer(use_cuda, is_sparse, save_path): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( - partial(inference_program, is_sparse), - param_path=save_path, - place=place) + inferencer = fluid.Inferencer(param_path=save_path, place=place) lod = [0, 1] first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) second_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) third_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) fourth_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1) - result = inferencer.infer({ - 'firstw': first_word, - 'secondw': second_word, - 'thirdw': third_word, - 'forthw': fourth_word - }) - print(result) + + result = inferencer.infer( + { + 'firstw': first_word, + 'secondw': second_word, + 'thirdw': third_word, + 'forthw': fourth_word + }, + return_numpy=False) + print(np.array(result[0])) def main(use_cuda, is_sparse): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 4eb25a6e00b7564ac17db568ec78c1c84933af43..a3be1a8db68c0d9d46746e70d95342447c35e237 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -205,7 +205,8 @@ class TestParallelExecutorBase(unittest.TestCase): allow_op_delay=False, feed_dict=None, seed=None, - use_parallel_executor=True): + use_parallel_executor=True, + balance_parameter_opt_between_cards=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -234,7 +235,11 @@ class TestParallelExecutorBase(unittest.TestCase): if use_parallel_executor: exe = fluid.ParallelExecutor( - True, loss_name=loss.name, allow_op_delay=allow_op_delay) + True, + loss_name=loss.name, + allow_op_delay=allow_op_delay, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) else: exe = fluid.Executor(place=place) @@ -280,20 +285,27 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( './mnist.recordio', reader, feeder) - def check_simple_fc_convergence(self): + def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): self.check_network_convergence(simple_fc_net) self.check_network_convergence(simple_fc_net, allow_op_delay=True) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') self.check_network_convergence( - simple_fc_net, feed_dict={"image": img, - "label": label}) + simple_fc_net, + feed_dict={"image": img, + "label": label}, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) def test_simple_fc(self): - 
self.check_simple_fc_convergence() + self.check_simple_fc_convergence(False) + + def test_simple_fc_with_new_strategy(self): + self.check_simple_fc_convergence(True) - def check_simple_fc_parallel_accuracy(self): + def check_simple_fc_parallel_accuracy(self, + balance_parameter_opt_between_cards): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -307,7 +319,9 @@ class TestMNIST(TestParallelExecutorBase): seed=1000, feed_dict={"image": img, "label": label}, - use_parallel_executor=True) + use_parallel_executor=True, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) for p_f in parallel_first_loss: self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) @@ -315,18 +329,28 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy() + self.check_simple_fc_parallel_accuracy(False) - def check_batchnorm_fc_convergence(self): + def test_simple_fc_parallel_accuracy_with_new_strategy(self): + self.check_simple_fc_parallel_accuracy(True) + + def check_batchnorm_fc_convergence(self, + balance_parameter_opt_between_cards): self.check_network_convergence(fc_with_batchnorm) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') self.check_network_convergence( - fc_with_batchnorm, feed_dict={"image": img, - "label": label}) + fc_with_batchnorm, + feed_dict={"image": img, + "label": label}, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence() + self.check_batchnorm_fc_convergence(False) + + def test_batchnorm_fc_with_new_strategy(self): + self.check_batchnorm_fc_convergence(True) class TestResnet(TestParallelExecutorBase): @@ -348,17 +372,22 @@ class TestResnet(TestParallelExecutorBase): # fluid.recordio_writer.convert_reader_to_recordio_file( # "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress) - def check_resnet_convergence(self): + def check_resnet_convergence(self, balance_parameter_opt_between_cards): import functools batch_size = 2 self.check_network_convergence( functools.partial( SE_ResNeXt50Small, batch_size=batch_size), iter=20, - batch_size=batch_size) + batch_size=batch_size, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) def test_resnet(self): - self.check_resnet_convergence() + self.check_resnet_convergence(False) + + def test_resnet_with_new_strategy(self): + self.check_resnet_convergence(True) class ModelHyperParams(object): @@ -519,7 +548,7 @@ class TestTransformer(TestParallelExecutorBase): class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def check_network_convergence(self): + def check_network_convergence(self, balance_parameter_opt_between_cards): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -539,12 +568,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): feed_dict = {'image': image, 'label': label} train_exe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=True, + loss_name=loss.name, + main_program=main, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) test_exe = fluid.ParallelExecutor( use_cuda=True, main_program=test_program, - 
share_vars_from=train_exe) + share_vars_from=train_exe, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) for i in xrange(5): test_loss, = test_exe.run([loss.name], feed=feed_dict) @@ -558,8 +593,11 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): "Train loss: " + str(train_loss) + "\n Test loss:" + str(test_loss)) - def test_parallel(self): - self.check_network_convergence() + def test_parallel_testing(self): + self.check_network_convergence(False) + + def test_parallel_testing_with_new_strategy(self): + self.check_network_convergence(True) import paddle.dataset.conll05 as conll05 @@ -579,7 +617,7 @@ embedding_name = 'emb' def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, - is_sparse, **ignored): + is_sparse, balance_parameter_opt_between_cards, **ignored): # 8 features predicate_embedding = fluid.layers.embedding( input=predicate, @@ -648,7 +686,9 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, class TestCRFModel(unittest.TestCase): - def check_network_convergence(self, is_sparse): + def check_network_convergence(self, + is_sparse, + balance_parameter_opt_between_cards=False): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -696,7 +736,11 @@ class TestCRFModel(unittest.TestCase): exe = fluid.Executor(place) exe.run(startup) - pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name) + pe = fluid.ParallelExecutor( + use_cuda=True, + loss_name=avg_cost.name, + balance_parameter_opt_between_cards=balance_parameter_opt_between_cards + ) feeder = fluid.DataFeeder( feed_list=[ @@ -718,6 +762,14 @@ class TestCRFModel(unittest.TestCase): def test_update_dense_parameter(self): self.check_network_convergence(is_sparse=False) + def test_update_sparse_parameter_with_new_strategy(self): + self.check_network_convergence( + is_sparse=False, balance_parameter_opt_between_cards=True) + + def test_update_dense_parameter_with_new_strategy(self): + self.check_network_convergence( + is_sparse=False, balance_parameter_opt_between_cards=True) + # test fetch all the variables of global_block @@ -796,5 +848,42 @@ class TestFetchOp(unittest.TestCase): self.parallel_exe(train_inputs, seed=1) +class TestFeedParallel(unittest.TestCase): + def test_main(self): + main = fluid.Program() + startup = fluid.Program() + startup.random_seed = 1 + with fluid.scope_guard(fluid.core.Scope()): + with fluid.program_guard(main, startup): + data = fluid.layers.data( + name='image', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + out = Lenet(data, class_dim=102) + loss = fluid.layers.cross_entropy(input=out, label=label) + loss = fluid.layers.mean(loss) + opt = fluid.optimizer.Momentum( + learning_rate=0.1, + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + + opt.minimize(loss) + place = fluid.CUDAPlace(0) + feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) + reader = feeder.decorate_reader( + paddle.batch( + flowers.train(), batch_size=16), multi_devices=True) + exe = fluid.Executor(place) + exe.run(startup) + pe = fluid.ParallelExecutor( + use_cuda=True, loss_name=loss.name, main_program=main) + + for batch_id, data in enumerate(reader()): + loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) + print batch_id, loss_np + if batch_id == 2: + break + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 
30b58b465ef2a7945ed87ce69397a050fc337623..67d8be82d5fa850a526f92753a086dac0cab5e2c 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -19,7 +19,7 @@ import executor import data_feeder import contextlib import io -import transpiler +import unique_name # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module import optimizer as opt_module @@ -56,26 +56,62 @@ class EndStepEvent(object): self.step = step_id +def check_and_get_place(place): + """ + Check the type of place or get the default place + Args: + place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. + + Raises: + TypeError if the type mismatched. + + Returns: + the original place if it is not None. + if fluid is compiled with CUDA, returns CUDAPlace(0) by default. + Otherwise returns CPUPlace by default. + """ + if place is None: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + else: + return core.CPUPlace() + else: + if not isinstance(place, core.CUDAPlace) and not isinstance( + place, core.CPUPlace): + raise TypeError("Place should be either CUDAPlace or CPUPlace") + return place + + class Trainer(object): """ Args: - program_func(callable): A function which will return loss. The loss must be a scaler. + train_func(callable): A function which will return loss. The loss must be a scalar. + infer_func(callable): A function which will return predict, used to save inference model optimizer(optimizer.Optimizer): The optimizer should be an instance of Optimizer place: The device place of this trainer. """ - def __init__(self, program_func, optimizer, param_path=None, place=None): + def __init__(self, + train_func, + infer_func, + optimizer, + param_path=None, + place=None): # 1. we need to generate a framework.Program by calling # program_func. Reference: fluid.program_guard in # test_word2vec.py + if not isinstance(optimizer, opt_module.Optimizer): + raise TypeError("The optimizer should be an instance of Optimizer") + + self.infer_func = infer_func self.scope = core.Scope() self.startup_program = framework.Program() self.train_program = framework.Program() with framework.program_guard(self.train_program, self.startup_program): - program_func_outs = program_func() + program_func_outs = train_func() self.test_outputs = program_func_outs if isinstance( program_func_outs, list) else [program_func_outs] self.test_program = self.train_program.clone() @@ -86,9 +122,9 @@ class Trainer(object): loss = self.test_outputs[0] optimize_ops, params_grads = optimizer.minimize(loss) - self.place = Trainer._check_and_get_place(place) + self.place = check_and_get_place(place) - self.dist_transpile_if_necessary(optimize_ops, params_grads) + self._dist_transpile_if_necessary(optimize_ops, params_grads) # 2. move the default_main_program to self.program and run the # default_startup program on an empty core.Scope() @@ -101,7 +137,7 @@ class Trainer(object): # load params from param_path into scope io.load_persistables(exe, dirname=param_path) - def dist_transpile_if_necessary(self, optimize_ops, params_grads): + def _dist_transpile_if_necessary(self, optimize_ops, params_grads): if "PADDLE_TRAINING_ROLE" not in os.environ: return @@ -190,31 +226,14 @@ class Trainer(object): exe = executor.Executor(self.place) io.save_persistables(exe, dirname=param_path) - @staticmethod - def _check_and_get_place(place): - """ - Check the type of place or get the default place - Args: - place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. 
- - Raises: - TypeError if the type mismatched. - - Returns: - the original place if it is not None. - if fluid is compiled with CUDA, returns CUDAPlace(0) by default. - Otherwise returns CPUPlace by default. - """ - if place is None: - if core.is_compiled_with_cuda(): - return core.CUDAPlace(0) - else: - return core.CPUPlace() - else: - if not isinstance(place, core.CUDAPlace) and not isinstance( - place, core.CPUPlace): - raise TypeError("Place should be either CUDAPlace or CPUPlace") - return place + def save_inference_model(self, model_path): + inference_program = framework.Program() + with framework.program_guard(inference_program): + with unique_name.guard(): + predict_var = self.infer_func() + predict_var = self.train_program.block(0).var(predict_var.name) + exe = executor.Executor(self.place) + io.save_inference_model(model_path, [], [predict_var], exe) @contextlib.contextmanager def _prog_and_scope_guard(self):
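
The core of the `multi_devices_graph_builder` change is the optional `balance_parameter_opt_between_cards_` strategy: each parameter gradient is reduced onto a single device chosen round-robin, the optimizer ops for that parameter are pinned to the same device (later ops are placed via `GetOpDeviceID`, which checks where an input variable already lives), and the updated parameters are broadcast back to the other devices after the graph is built. Below is a simplified Python rendering of that placement rule; it illustrates the logic only and is not the C++ implementation, and the variable names are examples.

```python
# Simplified sketch of the round-robin gradient placement used when
# balance_parameter_opt_between_cards is enabled.  The real logic lives in
# MultiDevSSAGraphBuilder::Build; the names below are illustrative.

GRAD_SUFFIX = "@GRAD"  # kGradVarSuffix in Paddle


def place_gradients(grad_names, num_devices):
    grad_device = {}                                        # grad name -> device that reduces/updates it
    bcast_per_device = [set() for _ in range(num_devices)]  # params to broadcast back from each device
    cur_device_id = 0
    for og in grad_names:
        grad_device[og] = cur_device_id                     # a reduce op is created on this device
        bcast_per_device[cur_device_id].add(og[:-len(GRAD_SUFFIX)])
        cur_device_id = (cur_device_id + 1) % num_devices
    return grad_device, bcast_per_device


grads, bcasts = place_gradients(
    ["fc_0.w_0@GRAD", "fc_0.b_0@GRAD", "fc_1.w_0@GRAD"], num_devices=2)
# grads  == {"fc_0.w_0@GRAD": 0, "fc_0.b_0@GRAD": 1, "fc_1.w_0@GRAD": 0}
# bcasts == [{"fc_0.w_0", "fc_1.w_0"}, {"fc_0.b_0"}]
```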
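
From Python, the strategy is enabled with the new `balance_parameter_opt_between_cards` argument of `fluid.ParallelExecutor`, which is plumbed through `pybind` into the graph builder. A minimal sketch, assuming a CUDA build; the tiny classifier, layer sizes, and random data are placeholders, not part of the patch.

```python
import numpy as np
import paddle.fluid as fluid

img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
predict = fluid.layers.fc(input=img, size=10, act='softmax')
loss = fluid.layers.mean(fluid.layers.cross_entropy(input=predict, label=label))
fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)

fluid.Executor(fluid.CUDAPlace(0)).run(fluid.default_startup_program())

# New flag: update each parameter on one card (round-robin) instead of on all
# cards.  It defaults to False and the docstring marks it as not yet recommended.
pe = fluid.ParallelExecutor(
    use_cuda=True,
    loss_name=loss.name,
    balance_parameter_opt_between_cards=True)

feed = {'image': np.random.random([32, 784]).astype('float32'),
        'label': np.random.randint(0, 10, size=[32, 1]).astype('int64')}
loss_np = np.array(pe.run(feed=feed, fetch_list=[loss.name])[0])
```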
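
`DataFeeder.decorate_reader` (built on the new `feed_parallel`) turns a plain batched reader into one that yields a list of per-device feed dictionaries, one mini-batch per card, which `ParallelExecutor.run` accepts directly; with the default `drop_last=True` an incomplete trailing group of batches is dropped. The sketch below is condensed from the new `TestFeedParallel` case; `data`, `label`, `loss`, and `main` stand for the layers and main program of whatever network has been built beforehand and are not defined here.

```python
import numpy as np
import paddle
import paddle.dataset.flowers as flowers
import paddle.fluid as fluid

place = fluid.CUDAPlace(0)
feeder = fluid.DataFeeder(place=place, feed_list=[data, label])

# With multi_devices=True each item yielded by reader() is a list of feed
# dicts, one per device.
reader = feeder.decorate_reader(
    paddle.batch(flowers.train(), batch_size=16), multi_devices=True)

pe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name, main_program=main)
for batch_id, per_device_feed in enumerate(reader()):
    loss_np = np.array(pe.run(feed=per_device_feed, fetch_list=[loss.name])[0])
    print(batch_id, loss_np)
```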
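
The high-level API now splits responsibilities: `Trainer` takes both a `train_func` and an `infer_func`, saves a deployable model with `save_inference_model` (`io.save_inference_model` now tolerates an empty `feeded_var_names` list, which is what `Trainer.save_inference_model` passes), and `Inferencer` simply loads that saved model from `param_path` and runs it. A minimal end-to-end sketch following the updated recognize_digits tests; `train_program` and `inference_program` are the user-defined functions shown there, and the save path, feed key, and input shape are assumptions taken from the MNIST example rather than guarantees of the API.

```python
import numpy
import paddle
import paddle.fluid as fluid

save_dirname = "recognize_digits_mlp.inference.model"  # placeholder path

trainer = fluid.Trainer(
    train_func=train_program,      # returns the scalar loss
    infer_func=inference_program,  # returns the prediction variable
    place=fluid.CPUPlace(),
    optimizer=fluid.optimizer.Adam(learning_rate=0.001))


def event_handler(event):
    if isinstance(event, fluid.EndEpochEvent):
        trainer.save_inference_model(save_dirname)


train_reader = paddle.batch(
    paddle.reader.shuffle(paddle.dataset.mnist.train(), buf_size=500),
    batch_size=64)
trainer.train(num_epochs=1, event_handler=event_handler,
              reader=train_reader, feed_order=['img', 'label'])

# Inference no longer needs the network function, only the saved directory.
inferencer = fluid.Inferencer(param_path=save_dirname, place=fluid.CPUPlace())
tensor_img = numpy.random.uniform(-1.0, 1.0, [1, 1, 28, 28]).astype("float32")
results = inferencer.infer({'img': tensor_img})
```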