diff --git a/CMakeLists.txt b/CMakeLists.txt index db3c3b8e2069f9ae5ad02286b59decf8fe764c2d..4861e8065a8a70bcc3b859b121cf404461f47ba7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) + +# PY_VERSION +if(NOT PY_VERSION) + set(PY_VERSION 2.7) +endif() # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -146,6 +152,7 @@ endif() ######################################################################################## include(external/mklml) # download mklml package +include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -232,6 +239,10 @@ if(WITH_MKLML) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) endif() +if(WITH_LIBXSMM) + list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS}) +endif() + if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() diff --git a/Dockerfile b/Dockerfile index 48c750358cfcb227667c429f19befcaa2f51ebbd..402adee2ea2822250ebc8f6229fd6a44545d58e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ RUN pip install -r /root/requirements.txt diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake new file mode 100644 index 0000000000000000000000000000000000000000..530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0 --- /dev/null +++ b/cmake/external/libxsmm.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) + +IF(NOT WITH_LIBXSMM) + return() +ENDIF() + +IF(WIN32 OR APPLE OR ANDROID OR IOS) + MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") + SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) + return() +ENDIF() + +INCLUDE (ExternalProject) + +SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) +SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) +SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) +SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." 
FORCE) +SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" + "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +ExternalProject_Add( + extern_libxsmm + GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" + GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" + PREFIX ${LIBXSMM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install + INSTALL_COMMAND "" +) +ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") +include_directories(${LIBXSMM_INCLUDE_DIR}) +ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) +ADD_DEPENDENCIES(libxsmm extern_libxsmm) +LIST(APPEND external_project_dependencies libxsmm) + diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index ce6a88b51dc98ac46dd3935f12658d60d364ba8c..56024edf5be092f81ed893633a8e7cafc8c8d429 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -121,6 +121,11 @@ ELSE() TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") +IF(WITH_LIBXSMM) + TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS}) + ADD_DEPENDENCIES(cblas extern_libxsmm) +ENDIF() + IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) LIST(APPEND external_project_dependencies cblas) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index d7e5571bdbd8ba58d8a08c9426971f1c7b186413..f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,9 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp 2.7) -FIND_PACKAGE(PythonLibs 2.7) +FIND_PACKAGE(PythonInterp ${PY_VERSION}) +FIND_PACKAGE(PythonLibs ${PY_VERSION}) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index b82c2ef4082110f1621eb38d50361396511a4825..6f5d4471a97cc4efc73b9df68040ab9eccde0b1c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -276,13 +276,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } } - // Insert BCast Ops - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); + bool use_gpu = false; +#ifdef PADDLE_WITH_CUDA + use_gpu = nccl_ctxs_ != nullptr; +#endif + + if (use_gpu || + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + // Insert BCast Ops + for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(&result, bcast_name, dev_id); + } } } + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. 
@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - - for (auto &varname : op.InputArgumentNames()) { - int dev_id = GetVarDeviceID(varname); - if (dev_id != -1) { - return dev_id; - } + int op_role = boost::get( + op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + if (op_role != static_cast(framework::OpRole::kOptimize)) { + return -1; } - return -1; + auto param_grad = boost::get>( + op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(), + param_grad[0]); + return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3a9027713afb5287c7addf8be745acfd185104ee..9a72e1baa34274201c40bd83a7aace549a7fc6ae 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -45,6 +45,7 @@ class ParallelExecutorPrivate { #endif bool own_local_scope_; bool use_cuda_; + bool use_all_reduce_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->use_all_reduce_ = + build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + + if (!member_->use_all_reduce_) { + PADDLE_ENFORCE(places.size() > 1, + "If you set build_strategy.reduce with 'Reduce'," + "the number of places must be greater than 1."); + } // Step 1. Bcast the params to devs. // Create local scopes @@ -95,7 +104,7 @@ ParallelExecutor::ParallelExecutor( } if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { - BCastParamsToDevs(bcast_vars); + BCastParamsToDevices(bcast_vars); } // Startup Program has been run. All local scopes has correct parameters. @@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA."); #endif } @@ -131,9 +140,9 @@ ParallelExecutor::ParallelExecutor( member_->places_, std::move(member_->executor_))); } -void ParallelExecutor::BCastParamsToDevs( +void ParallelExecutor::BCastParamsToDevices( const std::unordered_set &vars) const { - // the the initializing bcast, all vars would be bcast from device(0), + // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. bool initializing = builder_.get() == nullptr ? true : false; @@ -209,9 +218,16 @@ void ParallelExecutor::BCastParamsToDevs( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - t->mutable_data(cpu, main_tensor.type()); - paddle::framework::TensorCopy(main_tensor, cpu, t); + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
+ if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); + } else { + t->ShareDataWith(main_tensor); + } } } } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 6985b6540690c6218bcee51ba0e69f3d34812bfc..ffb9934a2d702b2bf6db7ad75a6bf9867e1e9901 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -66,7 +66,7 @@ class ParallelExecutor { void Run(const std::vector &fetch_tensors, const std::string &fetched_var_name); - void BCastParamsToDevs(const std::unordered_set &vars) const; + void BCastParamsToDevices(const std::unordered_set &vars) const; private: ParallelExecutorPrivate *member_; diff --git a/paddle/fluid/operators/checkpoint_notify_op.cc b/paddle/fluid/operators/checkpoint_notify_op.cc index c4219a429a53eb4869426a2674109555fb784b85..3a2527e407bb179c4873fa3ffe2e8f22fb47faf7 100644 --- a/paddle/fluid/operators/checkpoint_notify_op.cc +++ b/paddle/fluid/operators/checkpoint_notify_op.cc @@ -48,7 +48,7 @@ class CheckpointNotifyOp : public framework::OperatorBase { VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name << " and dir:" << dir << " to " << epmap[i]; } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 35318a805898de645c844a2224f6df8c458d346c..4d60801b6a6ecaabf1165321e0cb19044d27aa34 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -281,9 +281,10 @@ void GRPCClient::AsyncCheckpointNotify(const std::string& ep, req_count_++; } -void GRPCClient::Wait() { +bool GRPCClient::Wait() { std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); + return ok_; } void GRPCClient::Proceed() { @@ -297,6 +298,14 @@ void GRPCClient::Proceed() { if (c->status_.ok()) { VLOG(3) << c->var_h_.String() << " process"; c->Process(); + } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { + LOG(ERROR) << c->var_h_.String() + << " meets grpc error:" << c->status_.error_message(); + { + std::lock_guard lk(sync_mutex_); + ok_ = false; + } + sync_cond_.notify_all(); } else { LOG(FATAL) << c->var_h_.String() << " meets grpc error:" << c->status_.error_message(); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 5dae20155edcf9edd746a5d9a9bbe0ccd789f431..d03a3e56aedbe4a008ee9ff187111f7635d14b58 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -188,7 +188,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { class GRPCClient : public RPCClient { public: - GRPCClient() {} + GRPCClient() : ok_(true) {} virtual ~GRPCClient(); bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, @@ -221,7 +221,7 @@ class GRPCClient : public RPCClient { void AsyncSendEndPass(const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + bool Wait() override; void SendBeginPass() override; @@ -247,6 +247,7 @@ class GRPCClient : public RPCClient { std::mutex sync_mutex_; 
std::condition_variable sync_cond_; std::atomic req_count_{0}; + bool ok_; // mutex for GetChannel thread safety std::mutex chan_mutex_; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 6479d3a97bafba37b74a1d1c04852a6e60e01be8..4d87376fbf776e29156b78d826f5012bc53460df 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -72,7 +72,7 @@ class RPCClient { virtual void SendBeginPass() = 0; virtual void SendEndPass() = 0; - virtual void Wait() = 0; + virtual bool Wait() = 0; template static RPCClient* GetInstance() { diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 02beb80fc8a9f451393dcdd54492c4f88f908497..680fde19eefe57475b7526ebc29d4ff977a16977 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -45,13 +45,13 @@ class FetchBarrierOp : public framework::OperatorBase { distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); for (auto& ep : eps) { VLOG(3) << "fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f6c1e5c35f02cd4bc729eea78b17fac017aa90e..70f88f24f682e05972ca73ef7b50f96be50d1ef4 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -21,6 +21,10 @@ #include "paddle/fluid/platform/dynload/mklml.h" #endif +#ifdef PADDLE_WITH_LIBXSMM +#include +#endif + #ifdef PADDLE_USE_OPENBLAS #include #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 2ce94cfc93823aa891114ef8fd1e851727ebc623..238bd3f8def9eaa6c18afdab1031c4babfde8ae2 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include "paddle/fluid/operators/math/math_function.h" @@ -30,6 +31,12 @@ struct CBlas { platform::dynload::cblas_sgemm(args...); } +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_sgemm(args...); + } +#endif template static void AXPY(ARGS... args) { platform::dynload::cblas_saxpy(args...); @@ -63,6 +70,12 @@ struct CBlas { platform::dynload::cblas_dgemm(args...); } +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_dgemm(args...); + } +#endif template static void AXPY(ARGS... args) { platform::dynload::cblas_daxpy(args...); @@ -140,6 +153,9 @@ struct CBlas { template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void SMM_GEMM(...) { + PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + } #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) 
{ PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -147,6 +163,33 @@ struct CBlas { #endif }; +template +inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa, + bool transb, const T &alpha, const T &beta) { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom + constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + if (m * n * k > LIBXSMM_THRESHOLD || transa || transb || + std::abs(alpha - static_cast(1) > + std::numeric_limits::epsilon()) || + std::abs(beta) > std::numeric_limits::epsilon()) { + return false; + } else { + return true; + } +#endif + return false; +} + +template <> +inline bool UseXSMM(const int &m, const int &n, const int &k, + bool transa, bool transb, + const platform::float16 &alpha, + const platform::float16 &beta) { + return false; +} + template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -156,8 +199,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); +#ifdef PADDLE_WITH_LIBXSMM + if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, + beta)) { + // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, + &beta, C, &ldc); + } else { +#endif + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +#ifdef PADDLE_WITH_LIBXSMM + } +#endif } template <> diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index b545671b43d3a453ab03e4774427179617f62db0..078dd448c385dbb8a00025ee2ba08d0c41a4730a 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) { EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); } +#ifdef PADDLE_WITH_LIBXSMM +template +void MklSmmCompare(int m, int n, int k) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_smm; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CSMM = mat_c_smm.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + T alpha = static_cast(1); + T beta = static_cast(0); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i); + } + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + + auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + const char transa = 'N'; + const char transb = 'N'; + paddle::operators::math::CBlas::SMM_GEMM(&transa, &transb, &n, &m, &k, + &alpha, B, &ldb, A, &lda, &beta, + CSMM, &ldc); + }; + + auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, + lda, B, ldb, beta, CMKL, ldc); + }; + + smm(); + mkl(); + ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel()); + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]); + } +} +TEST(math_function, gemm_mkl_vs_smm) { + MklSmmCompare(1, 2, 3); + 
MklSmmCompare(1, 2, 3); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 8, 5); + MklSmmCompare(3, 8, 5); +} +#endif -TEST(math_function, gemm_trans_clbas) { +TEST(math_function, gemm_trans_cblas) { paddle::framework::Tensor input1; paddle::framework::Tensor input2; paddle::framework::Tensor input3; diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index 8734282fe496b8e90af19abd5549566d62316fc3..4b804740a06f9e29704f2b3f58a90191e3559347 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -53,7 +53,7 @@ class PrefetchOp : public framework::OperatorBase { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 9854a31f5b10f5ecd940c0d41c2c3e468fc17bad..1ba684014904e61a86bebacd7d29d7e10d313092 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -51,7 +51,7 @@ class RecvOp : public framework::OperatorBase { rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); } if (sync_mode) { - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } } }; diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 6b4572dcccc21e783f1df0b9bcde11d532ff4ba8..d7f8e994afd7e656bd5a9dd7c5ab45f0d52fe88b 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -50,13 +50,13 @@ class SendBarrierOp : public framework::OperatorBase { VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; // need to wait before sending send_barrier message - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); if (sync_mode) { for (auto& ep : eps) { VLOG(3) << "send barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } } }; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 0cac329aafa8c4c67cae48ba62a48575f5edba92..829f310d4233c01a7fbb9ccf7427f6e47ce8d384 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -59,7 +59,7 @@ class SendOp : public framework::OperatorBase { } } if (sync_send) { - rpc_client->Wait(); + PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } } }; diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 7ddb82ef6ff063868a4b9b603b8ab89700b9dd13..054dd481994d03f71b0ed5dc73e103085f6c91aa 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel { #endif for (size_t i = 0; i < row; i++) { std::vector> vec; + vec.reserve(col); for (size_t j = 0; j < col; j++) { vec.push_back(std::pair(eg_input(i, j), j)); } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d8dc421bed711cfc1a149592c24b11c4ef115ec9..216c4666c0a311f93f29692b2ca1d17bf9dafab8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -68,7 +68,7 @@ bool IsCompiledWithCUDA() { } bool IsCompiledWithDIST() { -#ifdef PADDLE_WITH_DIST +#ifdef PADDLE_WITH_DISTRIBUTE return true; #else return false; @@ -669,7 +669,7 @@ All parameter, weight, gradient are variables in Paddle. 
const std::string &, Scope *, std::vector &, const ExecutionStrategy &, const BuildStrategy &, size_t, size_t>()) - .def("bcast_params", &ParallelExecutor::BCastParamsToDevs) + .def("bcast_params", &ParallelExecutor::BCastParamsToDevices) // NOTE: even we return a vec* to Python use reference policy. // We still cannot get local_scope from this vector, since the element // of vec will be freed by Python GC. We can only return Scope* diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp index 7faeff55c28b9065179ad27b3b604a9f411249e5..21ed049c4d2743d1fa914d6948d6c8c2862f0bfc 100644 --- a/paddle/legacy/utils/PythonUtil.cpp +++ b/paddle/legacy/utils/PythonUtil.cpp @@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args); +#if PY_MAJOR_VERSION >= 3 + Py_ssize_t str_size = 0u; + const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size); + return std::string(str, (size_t)str_size); +#else return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); +#endif // PY_MAJOR_VERSION >= 3 } PyObjectPtr createPythonClass( diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h index b0c8612c378fbe12cdf24e51a5b6546740b2d4c8..d5b2dbddde21f5c2a0696aadeda2b057175fc5e9 100644 --- a/paddle/legacy/utils/PythonUtil.h +++ b/paddle/legacy/utils/PythonUtil.h @@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName, namespace py { PyObjectPtr import(const std::string& moduleName); +#if PY_MAJOR_VERSION >= 3 +/** + * Cast a PyLong to int type T. + * @tparam T return type. + * @param [in] obj PyLong object. + * @param [out] ok status for casting. False if error occured. nullptr if user + * don't care is ok or not. + * @return The value of python object, or 0 if not ok. + */ +template +T castInt(PyObject* obj, bool* ok = nullptr) { + // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object + // were unified to long since python3 + if (PyLong_Check(obj)) { + if (ok) *ok = true; + return (T)PyLong_AsUnsignedLong(obj); + } else { + if (ok) *ok = false; + return (T)0; + } +} + +// Convert PyAPI from 2.x to 3.x +#define PyString_FromString PyUnicode_FromString +#define PyString_AsString PyUnicode_AsUTF8 + +#else /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. @@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) { return (T)0; } } +#endif // PY_MAJOR_VERSION >= 3 /** * Invoke repr of python object. 
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 57fd28d5b4791c66bc3762be63b27ad9f199d69c..4bdd86990f82be96085dca640cd52ffb843b2c32 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -78,6 +78,12 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + elif [ "$1" == "cp35-cp35m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" fi fi @@ -108,6 +114,7 @@ function cmake_gen() { -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -136,7 +143,8 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \ + -DPY_VERSION=${PY_VERSION:-2.7} } function abort(){ diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index d1cf04161ae4444ebc7da7fbc20e37dafe6c0fb1..241a07a35297e85763781a42696fd727733459a3 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. try: - from version import full_version as __version__ - from version import commit as __git_commit__ + from paddle.version import full_version as __version__ + from paddle.version import commit as __git_commit__ except ImportError: import sys - sys.stderr.write('''Warning with import paddle: you should not + sys.stderr.write('''Warning with import paddle: you should not import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) -import reader -import dataset -import batch +import paddle.reader +import paddle.dataset +import paddle.batch batch = batch.batch diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 3315e826e82a33dfeb9c5223ce196cffb1ae7234..54aa3edc51d3734633ce077a59bd86cec8d09032 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -15,20 +15,20 @@ Dataset package. 
""" -import mnist -import imikolov -import imdb -import cifar -import movielens -import conll05 -import uci_housing -import sentiment -import wmt14 -import wmt16 -import mq2007 -import flowers -import voc2012 -import image +import paddle.dataset.mnist +import paddle.dataset.imikolov +import paddle.dataset.imdb +import paddle.dataset.cifar +import paddle.dataset.movielens +import paddle.dataset.conll05 +import paddle.dataset.uci_housing +import paddle.dataset.sentiment +import paddle.dataset.wmt14 +import paddle.dataset.wmt16 +import paddle.dataset.mq2007 +import paddle.dataset.flowers +import paddle.dataset.voc2012 +import paddle.dataset.image __all__ = [ 'mnist', diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 2a8e3d410add466436524d8cc7714fce955af2b5..d9acfef58c3ba92c763d195c88f1323b3c6512b9 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None): param.gradient_clip_attr = copy.deepcopy(clip) -def append_gradient_clip_ops(param_grad): +def append_gradient_clip_ops(param_grads): context = dict() - for p, g in param_grad: - with p.block.program.optimized_guard(p): + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) if clip_attr is None: clip_attr = NullGradientClipAttr() @@ -339,8 +341,10 @@ def append_gradient_clip_ops(param_grad): clip_attr._process_context(context=context, param=p, grad=g) res = [] - for p, g in param_grad: - with p.block.program.optimized_guard(p): + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): res.append(clip_attr._create_operators(param=p, grad=g)) return res diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ea3117e02bd993b06de39725b2c3296031065e3c..d89cb246a939f247b94bc49f39198a909b1c30ea 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1319,7 +1319,7 @@ class Program(object): self._op_role_var = [var_name] @contextlib.contextmanager - def optimized_guard(self, var): + def optimized_guard(self, param_and_grads): """ A with guard to set :code:`Optimization` :code:`OpRole` and :code:`OpRoleVar` automatically. @@ -1327,17 +1327,20 @@ class Program(object): Notes: This is a very low level API. Users should not use it directly. Args: - var(Variable|str): The variable (name) to be optimized. + param_and_grads(list): The variables (names) to be optimized. Examples: >>> p, g = backward(...) 
- >>> with program.optimized_guard(p): + >>> with program.optimized_guard([p,g]): >>> p = p - 0.001 * g """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize - self._op_role_var = [var.name if isinstance(var, Variable) else var] + self._op_role_var = [ + var.name if isinstance(var, Variable) else var + for var in param_and_grads + ] yield self._op_role_var = [] self._current_role = OpRole.Forward diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 214f47afa1e4e29d53b11ccc035b4d840f963591..94e78d155f1c9aa5b7abda0e83db528ad5e2aafb 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -123,7 +123,7 @@ class Optimizer(object): """ pass - def _finish_update(self, block, parameters): + def _finish_update(self, block, parameters_and_grads): """Finish any custom updates needed before completing an optimization step @@ -226,18 +226,18 @@ class Optimizer(object): optimize_ops = [] for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue with param_and_grad[0].block.program.optimized_guard( - param_and_grad[0]): - if param_and_grad[0].trainable is True and param_and_grad[ - 1] is not None: + param_and_grad): + if param_and_grad[0].trainable is True: optimize_op = self._append_optimize_op(loss.block, param_and_grad) optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block, - [p[0] for p in parameters_and_grads]) + self._finish_update(loss.block, parameters_and_grads) end = len(global_block.ops) return global_block.slice_ops(start, end) @@ -564,13 +564,15 @@ class AdamOptimizer(Optimizer): return adam_op - def _finish_update(self, block, parameters): + def _finish_update(self, block, param_and_grads): """Update Beta1 and Beta2 Power accumulators """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param in parameters: - with param.block.program.optimized_guard(param): + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, @@ -691,13 +693,15 @@ class AdamaxOptimizer(Optimizer): return adamax_op - def _finish_update(self, block, parameters): + def _finish_update(self, block, parameters_and_grads): """Update Beta1 Power accumulator """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param in parameters: - with param.block.program.optimized_guard(param): + for param, grad in parameters_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) main_block.append_op( @@ -1158,7 +1162,9 @@ class ModelAverage(Optimizer): self.params_grads.append((param, grad)) for param, grad in self.params_grads: - with param.block.program.optimized_guard(param): + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): self._append_average_accumulate_op(param) self.apply_program = Program() diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 53f35f5cc062b4da431be19e4484f316bb37be9f..080c185420bdc79d6da1d5a52fdd11fa4105d59a 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ 
-41,12 +41,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None): """ params_and_grads = [] for param, grad in parameters_and_grads: - with param.block.program.optimized_guard(param): - # If no gradient then we don't need to do anything - if grad is None: - params_and_grads.append((param, grad)) - continue - + # If no gradient then we don't need to do anything + if grad is None: + params_and_grads.append((param, grad)) + continue + with param.block.program.optimized_guard([param, grad]): regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index cddf00765f4894126988c794763c34629449e8e6..fcf86cc5839113b75855ce97459b2ee4881238cd 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -35,7 +35,8 @@ class TestParallelExecutorBase(unittest.TestCase): feed_dict=None, seed=None, use_parallel_executor=True, - balance_parameter_opt_between_cards=False): + use_reduce=False, + optimizer=fluid.optimizer.Adam): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -50,14 +51,19 @@ class TestParallelExecutorBase(unittest.TestCase): main = fluid.Program() startup = fluid.Program() startup.random_seed = 1 # Fix random seed + main.random_seed = 1 with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed + main.random_seed = seed + loss = method(use_feed=feed_dict is not None) - adam = fluid.optimizer.Adam() - adam.minimize(loss) + + optimizer().minimize(loss) + if memory_opt: fluid.memory_optimize(main) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) @@ -65,7 +71,8 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_parallel_executor: exe = fluid.ParallelExecutor( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index a801d99aa1ced35eb7f081fde63ad541f0eb2589..f098dc7a3fb670e23471c2aa897011a3cf882c33 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -101,9 +101,7 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) @@ -115,20 +113,19 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - 
balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_simple_fc(self): - self.check_simple_fc_convergence(False, use_cuda=True) - self.check_simple_fc_convergence(False, use_cuda=False) + # use_cuda + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True, use_cuda=True) - self.check_simple_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self.check_simple_fc_convergence(True, True) + self.check_simple_fc_convergence(False, True) - def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -145,8 +142,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=True, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) for p_f in parallel_first_loss: self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) @@ -154,15 +150,15 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False, use_cuda=True) - self.check_simple_fc_parallel_accuracy(False, use_cuda=False) + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(False) def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True, use_cuda=True) - self.check_simple_fc_parallel_accuracy(True, use_cuda=False) + # use_cuda, use_reduce + self.check_simple_fc_parallel_accuracy(True, True) + self.check_simple_fc_parallel_accuracy(False, True) - def check_batchnorm_fc_convergence( - self, balance_parameter_opt_between_cards, use_cuda): + def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False): self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') @@ -171,16 +167,16 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False, use_cuda=True) - self.check_batchnorm_fc_convergence(False, use_cuda=False) + self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True, use_cuda=True) - self.check_batchnorm_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self.check_batchnorm_fc_convergence(True, True) + self.check_batchnorm_fc_convergence(False, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 066299e6c6f7f6c159cb0886e86d3404b027b698..4d39505b66abf44249e0ea160b82aaf7be0638cb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -13,8 +13,12 @@ # 
limitations under the License. import paddle.fluid as fluid +import paddle.fluid.layers.ops as ops +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter from parallel_executor_test_base import TestParallelExecutorBase import unittest +import math import os @@ -131,30 +135,71 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True, - iter=20): + def check_resnet_convergence_with_learning_rate_decay(self, + use_cuda=True, + use_reduce=False, + iter=20): + os.environ['CPU_NUM'] = str(4) + def _cosine_decay(learning_rate, step_each_epoch, epochs=120): + """ + Applies cosine decay to the learning rate. + lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr + + def _optimizer(learning_rate=0.01): + optimizer = fluid.optimizer.Momentum( + learning_rate=_cosine_decay( + learning_rate=learning_rate, step_each_epoch=2, epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + return optimizer + import functools + batch_size = 2 - self.check_network_convergence( + + single_first_loss, single_last_loss = self.check_network_convergence( functools.partial( SE_ResNeXt50Small, batch_size=batch_size), iter=iter, batch_size=batch_size, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) - - def test_resnet(self): - self.check_resnet_convergence(False, use_cuda=True) - self.check_resnet_convergence(False, use_cuda=False, iter=5) + use_reduce=use_reduce, + optimizer=_optimizer, + use_parallel_executor=False) - def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True, use_cuda=True) - self.check_resnet_convergence(True, use_cuda=False, iter=5) + parallel_first_loss, parallel_last_loss = self.check_network_convergence( + functools.partial( + SE_ResNeXt50Small, batch_size=batch_size), + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=use_reduce, + optimizer=_optimizer) + + for p_f in parallel_first_loss: + self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) + for p_l in parallel_last_loss: + self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + + def test_seresnext_with_learning_rate_decay(self): + self.check_resnet_convergence_with_learning_rate_decay(True, False) + self.check_resnet_convergence_with_learning_rate_decay( + False, False, iter=5) + + def test_seresnext_with_new_strategy_with_learning_rate_decay(self): + self.check_resnet_convergence_with_learning_rate_decay(True, True) + self.check_resnet_convergence_with_learning_rate_decay( + False, True, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 121c36e477327d4d0e7b1cba1713e68ce4d06e03..b0e9a6599730cde98338edc3e12eb81fc8f793c3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -31,6 +31,7 @@ Steps to transpile pserver: from __future__ import print_function import math +import random import numpy as np from ps_dispatcher import RoundRobin, HashName, PSDispatcher @@ -197,7 +198,8 @@ class 
DistributeTranspiler(object): # shuffle the map will avoid the uneven distribution above grad_var_mapping_items = self.grad_var_mapping.items() if not slice_var_up: - np.random.shuffle(grad_var_mapping_items) + random.seed(self.trainer_num) + random.shuffle(grad_var_mapping_items) for orig_varname, splited_vars in grad_var_mapping_items: eplist = ps_dispatcher.dispatch(splited_vars) diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 3b059735a924d58714cd88a761eb83143f1192d6..678026cf95970e8ff58c1bad20246059ffb464c1 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator: TODO(yuyang18): Should we add whole design doc here? """ -import decorator -from decorator import * +import paddle.reader.decorator +from paddle.reader.decorator import * -import creator +import paddle.reader.creator __all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 1f83cabb8481451736944823be45185deea4f43b..4b1fe94222d35f8c0e4e4cccc364227a3f9509d0 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -20,7 +20,7 @@ __all__ = [ from threading import Thread import subprocess -from Queue import Queue +from six.moves.queue import Queue import itertools import random import zlib diff --git a/python/requirements.txt b/python/requirements.txt index ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816..c091ecb111bda9d5e83c3ddcae93aed0745f9e4c 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -8,4 +8,4 @@ scipy>=0.19.0 Pillow nltk>=3.2.2 graphviz -LinkChecker +six diff --git a/python/setup.py.in b/python/setup.py.in index a064f36cc19dbc626dd85d76290280a01fa57215..a81cd19e10153be0d07badfa0c0fbcb01fe568f7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -17,7 +17,8 @@ def git_commit(): git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() except: git_commit = 'Unknown' - return git_commit + git_commit = git_commit.decode() + return str(git_commit) def _get_version_detail(idx): assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ @@ -44,6 +45,7 @@ def is_taged(): try: cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + git_tag = git_tag.decode() except: return False @@ -67,13 +69,13 @@ with_mkl = '%(with_mkl)s' def show(): if istaged: - print 'full_version:', full_version - print 'major:', major - print 'minor:', minor - print 'patch:', patch - print 'rc:', rc + print('full_version:', full_version) + print('major:', major) + print('minor:', minor) + print('patch:', patch) + print('rc:', rc) else: - print 'commit:', commit + print('commit:', commit) def mkl(): return with_mkl
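Notes on the changes above (illustrative sketches, not part of the patch).

For the Reduce/AllReduce switch wired through BuildStrategy.reduce_ in multi_devices_graph_builder.cc, the new check in parallel_executor.cc, and the use_reduce flag in parallel_executor_test_base.py, the following is a minimal Python sketch of how a caller selects the strategy. Only the BuildStrategy/ParallelExecutor names and the CPU_NUM environment variable come from the patch; the toy network is made up.

    import os
    import paddle.fluid as fluid

    use_cuda = False
    use_reduce = True                      # False -> AllReduce (the default)
    os.environ['CPU_NUM'] = '4'            # Reduce requires more than one place

    main, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main, startup):
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        pred = fluid.layers.fc(input=img, size=10, act='softmax')
        loss = fluid.layers.mean(fluid.layers.cross_entropy(input=pred, label=label))
        fluid.optimizer.Adam().minimize(loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    fluid.Executor(place).run(startup)

    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
        if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce

    # With ReduceStrategy.Reduce, ParallelExecutor now enforces more than one place.
    pe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=loss.name,
                                main_program=main, build_strategy=build_strategy)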
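The UseXSMM() helper added to blas_impl.h routes only small, plain GEMMs to libxsmm. A hedged restatement of that dispatch rule in Python follows; the 20*20*20 threshold and the conditions come from the patch, while eps stands in for std::numeric_limits<T>::epsilon() and the whole difference is kept inside the absolute value, which appears to be the intent of the comparison.

    def use_xsmm(m, n, k, transa, transb, alpha, beta, eps=1e-6):
        """Mirror of UseXSMM: libxsmm only for small, non-transposed GEMMs
        with alpha == 1 and beta == 0."""
        LIBXSMM_THRESHOLD = 20 * 20 * 20
        if (m * n * k > LIBXSMM_THRESHOLD or transa or transb
                or abs(alpha - 1.0) > eps or abs(beta) > eps):
            return False
        return True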
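The SMM_GEMM call in Blas<CPUDeviceContext>::GEMM hands the column-major libxsmm kernel the dimensions N, M, K and the operands in the order B, A. A short numpy check of the layout identity behind that swap (pure illustration, not PaddlePaddle API):

    import numpy as np

    M, N, K = 3, 4, 5
    A = np.arange(M * K, dtype=np.float32).reshape(M, K)   # row-major M x K
    B = np.arange(K * N, dtype=np.float32).reshape(K, N)   # row-major K x N

    # A row-major buffer read as column-major is the transpose, so asking a
    # column-major GEMM for B^T * A^T (an N x M result) writes memory that,
    # read back row-major, is exactly the M x N product A * B.
    assert np.allclose(A.dot(B), (B.T.dot(A.T)).T)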
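clip.py, optimizer.py and regularizer.py now all follow the same pattern around the reworked Program.optimized_guard: skip parameters whose gradient is None and pass the [param, grad] pair into the guard. A condensed sketch of that calling convention; the helper name and callback are illustrative.

    def for_each_optimizable(params_grads, append_op_fn):
        """params_grads: list of (Variable, Variable or None) pairs from backward."""
        for param, grad in params_grads:
            if grad is None:                 # parameter not reached by backward
                continue
            # optimized_guard now takes both variables, so ops appended inside
            # carry OpRole.Optimize and both names in their OpRoleVar attribute.
            with param.block.program.optimized_guard([param, grad]):
                append_op_fn(param, grad)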
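distribute_transpiler.py switches from np.random.shuffle to random.shuffle seeded with trainer_num, so every trainer shuffles the grad-to-pserver mapping into the same order. A small self-contained illustration with made-up variable names:

    import random

    def dispatch_order(trainer_num, grad_names):
        items = list(grad_names)   # materialize first; dict.items() is a view on Python 3
        random.seed(trainer_num)
        random.shuffle(items)
        return items

    grads = ['fc_0.w_0@GRAD', 'fc_0.b_0@GRAD', 'fc_1.w_0@GRAD', 'fc_1.b_0@GRAD']
    # Every trainer with the same trainer_num computes the same order,
    # unlike an unseeded shuffle.
    assert dispatch_order(2, grads) == dispatch_order(2, grads)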
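The reader and dataset packages move to the Python-2/3-compatible import style: absolute paddle.* imports plus six for renamed stdlib modules, which is why six replaces LinkChecker in python/requirements.txt. A minimal sketch of the idiom, assuming paddle is installed:

    from __future__ import print_function        # same statement on 2 and 3

    from six.moves.queue import Queue             # Queue on Python 2, queue on Python 3
    import paddle.dataset.mnist as mnist          # absolute import instead of `import mnist`

    q = Queue()
    q.put('ready')
    print(q.get(), mnist.__name__)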
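setup.py.in now decodes the subprocess output before formatting it, because Popen.communicate() returns bytes under Python 3. A hedged sketch of that pattern; the git command is taken from is_taged() and the helper name is illustrative.

    import subprocess

    def git_describe():
        try:
            cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD']
            out = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate()[0].strip()
            # bytes -> str on Python 3; on Python 2 this yields unicode,
            # hence the extra str() in the patch.
            return out.decode()
        except Exception:
            return 'Unknown'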