diff --git a/CMakeLists.txt b/CMakeLists.txt index db3c3b8e2069f9ae5ad02286b59decf8fe764c2d..4861e8065a8a70bcc3b859b121cf404461f47ba7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) + +# PY_VERSION +if(NOT PY_VERSION) + set(PY_VERSION 2.7) +endif() # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -146,6 +152,7 @@ endif() ######################################################################################## include(external/mklml) # download mklml package +include(external/libxsmm) # download, build, install libxsmm include(external/zlib) # download, build, install zlib include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -232,6 +239,10 @@ if(WITH_MKLML) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) endif() +if(WITH_LIBXSMM) + list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS}) +endif() + if(WITH_MKLDNN) list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB}) endif() diff --git a/Dockerfile b/Dockerfile index 48c750358cfcb227667c429f19befcaa2f51ebbd..402adee2ea2822250ebc8f6229fd6a44545d58e5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ RUN pip install -r /root/requirements.txt diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake new file mode 100644 index 0000000000000000000000000000000000000000..530f7ebe2813fb2f00c6b5b4d1f7b2f04fe650b0 --- /dev/null +++ b/cmake/external/libxsmm.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) + +IF(NOT WITH_LIBXSMM) + return() +ENDIF() + +IF(WIN32 OR APPLE OR ANDROID OR IOS) + MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.") + SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) + return() +ENDIF() + +INCLUDE (ExternalProject) + +SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) +SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) +SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) +SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) +SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" + "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +ExternalProject_Add( + extern_libxsmm + GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" + GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" + PREFIX ${LIBXSMM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install + INSTALL_COMMAND "" +) +ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") +include_directories(${LIBXSMM_INCLUDE_DIR}) +ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) +ADD_DEPENDENCIES(libxsmm extern_libxsmm) +LIST(APPEND external_project_dependencies libxsmm) + diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index ce6a88b51dc98ac46dd3935f12658d60d364ba8c..56024edf5be092f81ed893633a8e7cafc8c8d429 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -121,6 +121,11 @@ ELSE() TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") +IF(WITH_LIBXSMM) + TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS}) + ADD_DEPENDENCIES(cblas extern_libxsmm) +ENDIF() + IF(NOT ${CBLAS_FOUND}) ADD_DEPENDENCIES(cblas extern_openblas) LIST(APPEND external_project_dependencies cblas) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index d7e5571bdbd8ba58d8a08c9426971f1c7b186413..f17b8d46dc2d8ded81ced7de5827d5e7fd5109f0 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,9 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp 2.7) -FIND_PACKAGE(PythonLibs 2.7) +FIND_PACKAGE(PythonInterp ${PY_VERSION}) +FIND_PACKAGE(PythonLibs ${PY_VERSION}) + # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. ADD_LIBRARY(python SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index b82c2ef4082110f1621eb38d50361396511a4825..6f5d4471a97cc4efc73b9df68040ab9eccde0b1c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -276,13 +276,22 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } } - // Insert BCast Ops - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); + bool use_gpu = false; +#ifdef PADDLE_WITH_CUDA + use_gpu = nccl_ctxs_ != nullptr; +#endif + + if (use_gpu || + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + // Insert BCast Ops + for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(&result, bcast_name, dev_id); + } } } + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. @@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } - - for (auto &varname : op.InputArgumentNames()) { - int dev_id = GetVarDeviceID(varname); - if (dev_id != -1) { - return dev_id; - } + int op_role = boost::get( + op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + if (op_role != static_cast(framework::OpRole::kOptimize)) { + return -1; } - return -1; + auto param_grad = boost::get>( + op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(), + param_grad[0]); + return dev_id; } int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3a9027713afb5287c7addf8be745acfd185104ee..58be61362cabf22a3543af364f1b0bd180df826a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -45,6 +45,7 @@ class ParallelExecutorPrivate { #endif bool own_local_scope_; bool use_cuda_; + bool use_all_reduce_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->use_all_reduce_ = + build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + + if (!member_->use_all_reduce_) { + PADDLE_ENFORCE(places.size() > 1, + "If you set build_strategy.reduce with 'Reduce'," + "the number of places must be greater than 1."); + } // Step 1. Bcast the params to devs. // Create local scopes @@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor( #ifdef PADDLE_WITH_CUDA builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA."); #endif } @@ -133,7 +142,7 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToDevs( const std::unordered_set &vars) const { - // the the initializing bcast, all vars would be bcast from device(0), + // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. bool initializing = builder_.get() == nullptr ? true : false; @@ -209,9 +218,13 @@ void ParallelExecutor::BCastParamsToDevs( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - t->mutable_data(cpu, main_tensor.type()); - paddle::framework::TensorCopy(main_tensor, cpu, t); + if (member_->use_all_reduce_ || member_->use_cuda_) { + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); + } else { + t->ShareDataWith(main_tensor); + } } } } diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 9f6c1e5c35f02cd4bc729eea78b17fac017aa90e..70f88f24f682e05972ca73ef7b50f96be50d1ef4 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -21,6 +21,10 @@ #include "paddle/fluid/platform/dynload/mklml.h" #endif +#ifdef PADDLE_WITH_LIBXSMM +#include +#endif + #ifdef PADDLE_USE_OPENBLAS #include #endif diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 2ce94cfc93823aa891114ef8fd1e851727ebc623..238bd3f8def9eaa6c18afdab1031c4babfde8ae2 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once +#include #include #include "paddle/fluid/operators/math/math_function.h" @@ -30,6 +31,12 @@ struct CBlas { platform::dynload::cblas_sgemm(args...); } +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_sgemm(args...); + } +#endif template static void AXPY(ARGS... args) { platform::dynload::cblas_saxpy(args...); @@ -63,6 +70,12 @@ struct CBlas { platform::dynload::cblas_dgemm(args...); } +#ifdef PADDLE_WITH_LIBXSMM + template + static void SMM_GEMM(ARGS... args) { + libxsmm_dgemm(args...); + } +#endif template static void AXPY(ARGS... args) { platform::dynload::cblas_daxpy(args...); @@ -140,6 +153,9 @@ struct CBlas { template <> struct CBlas { static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void SMM_GEMM(...) { + PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + } #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -147,6 +163,33 @@ struct CBlas { #endif }; +template +inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa, + bool transb, const T &alpha, const T &beta) { +#ifdef PADDLE_WITH_LIBXSMM + // Refer to https://github.com/hfp/libxsmm/blob/master/README.md + // But the threshold is custom + constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; + if (m * n * k > LIBXSMM_THRESHOLD || transa || transb || + std::abs(alpha - static_cast(1) > + std::numeric_limits::epsilon()) || + std::abs(beta) > std::numeric_limits::epsilon()) { + return false; + } else { + return true; + } +#endif + return false; +} + +template <> +inline bool UseXSMM(const int &m, const int &n, const int &k, + bool transa, bool transb, + const platform::float16 &alpha, + const platform::float16 &beta) { + return false; +} + template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -156,8 +199,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; - CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb, - beta, C, ldc); +#ifdef PADDLE_WITH_LIBXSMM + if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha, + beta)) { + // Note: SMM use ColMajor + const char transa = 'N'; + const char transb = 'N'; + CBlas::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda, + &beta, C, &ldc); + } else { +#endif + CBlas::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, + ldb, beta, C, ldc); +#ifdef PADDLE_WITH_LIBXSMM + } +#endif } template <> diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index b545671b43d3a453ab03e4774427179617f62db0..078dd448c385dbb8a00025ee2ba08d0c41a4730a 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -54,8 +54,64 @@ TEST(math_function, gemm_notrans_cblas) { EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); } +#ifdef PADDLE_WITH_LIBXSMM +template +void MklSmmCompare(int m, int n, int k) { + paddle::framework::Tensor mat_a; + paddle::framework::Tensor mat_b; + paddle::framework::Tensor mat_c_smm; + paddle::framework::Tensor mat_c_mkl; + auto* cpu_place = new paddle::platform::CPUPlace(); + + T* A = mat_a.mutable_data({m, k}, *cpu_place); + T* B = mat_b.mutable_data({k, n}, *cpu_place); + T* CSMM = mat_c_smm.mutable_data({m, n}, *cpu_place); + T* CMKL = mat_c_mkl.mutable_data({m, n}, *cpu_place); + T alpha = static_cast(1); + T beta = static_cast(0); + for (int i = 0; i < mat_a.numel(); ++i) { + A[i] = static_cast(i); + } + for (int i = 0; i < mat_b.numel(); ++i) { + B[i] = static_cast(i); + } + // lda,ldb,ldc follow RowMajor + int lda = k; + int ldb = n; + int ldc = n; + + auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + const char transa = 'N'; + const char transb = 'N'; + paddle::operators::math::CBlas::SMM_GEMM(&transa, &transb, &n, &m, &k, + &alpha, B, &ldb, A, &lda, &beta, + CSMM, &ldc); + }; + + auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { + paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, + CblasNoTrans, m, n, k, alpha, A, + lda, B, ldb, beta, CMKL, ldc); + }; + + smm(); + mkl(); + ASSERT_EQ(mat_c_mkl.numel(), mat_c_smm.numel()); + for (int i = 0; i < mat_c_mkl.numel(); ++i) { + EXPECT_FLOAT_EQ(CSMM[i], CMKL[i]); + } +} +TEST(math_function, gemm_mkl_vs_smm) { + MklSmmCompare(1, 2, 3); + MklSmmCompare(1, 2, 3); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 2, 1); + MklSmmCompare(3, 8, 5); + MklSmmCompare(3, 8, 5); +} +#endif -TEST(math_function, gemm_trans_clbas) { +TEST(math_function, gemm_trans_cblas) { paddle::framework::Tensor input1; paddle::framework::Tensor input2; paddle::framework::Tensor input3; diff --git a/paddle/fluid/operators/top_k_op.h b/paddle/fluid/operators/top_k_op.h index 7ddb82ef6ff063868a4b9b603b8ab89700b9dd13..054dd481994d03f71b0ed5dc73e103085f6c91aa 100644 --- a/paddle/fluid/operators/top_k_op.h +++ b/paddle/fluid/operators/top_k_op.h @@ -60,6 +60,7 @@ class TopkKernel : public framework::OpKernel { #endif for (size_t i = 0; i < row; i++) { std::vector> vec; + vec.reserve(col); for (size_t j = 0; j < col; j++) { vec.push_back(std::pair(eg_input(i, j), j)); } diff --git a/paddle/legacy/utils/PythonUtil.cpp b/paddle/legacy/utils/PythonUtil.cpp index 7faeff55c28b9065179ad27b3b604a9f411249e5..21ed049c4d2743d1fa914d6948d6c8c2862f0bfc 100644 --- a/paddle/legacy/utils/PythonUtil.cpp +++ b/paddle/legacy/utils/PythonUtil.cpp @@ -136,7 +136,13 @@ std::string callPythonFunc(const std::string& moduleName, const std::string& funcName, const std::vector& args) { PyObjectPtr obj = callPythonFuncRetPyObj(moduleName, funcName, args); +#if PY_MAJOR_VERSION >= 3 + Py_ssize_t str_size = 0u; + const char* str = PyUnicode_AsUTF8AndSize(obj.get(), &str_size); + return std::string(str, (size_t)str_size); +#else return std::string(PyString_AsString(obj.get()), PyString_Size(obj.get())); +#endif // PY_MAJOR_VERSION >= 3 } PyObjectPtr createPythonClass( diff --git a/paddle/legacy/utils/PythonUtil.h b/paddle/legacy/utils/PythonUtil.h index b0c8612c378fbe12cdf24e51a5b6546740b2d4c8..d5b2dbddde21f5c2a0696aadeda2b057175fc5e9 100644 --- a/paddle/legacy/utils/PythonUtil.h +++ b/paddle/legacy/utils/PythonUtil.h @@ -88,6 +88,33 @@ PyObjectPtr createPythonClass(const std::string& moduleName, namespace py { PyObjectPtr import(const std::string& moduleName); +#if PY_MAJOR_VERSION >= 3 +/** + * Cast a PyLong to int type T. + * @tparam T return type. + * @param [in] obj PyLong object. + * @param [out] ok status for casting. False if error occured. nullptr if user + * don't care is ok or not. + * @return The value of python object, or 0 if not ok. + */ +template +T castInt(PyObject* obj, bool* ok = nullptr) { + // Refer to https://www.python.org/dev/peps/pep-0237/, the int and long object + // were unified to long since python3 + if (PyLong_Check(obj)) { + if (ok) *ok = true; + return (T)PyLong_AsUnsignedLong(obj); + } else { + if (ok) *ok = false; + return (T)0; + } +} + +// Convert PyAPI from 2.x to 3.x +#define PyString_FromString PyUnicode_FromString +#define PyString_AsString PyUnicode_AsUTF8 + +#else /** * Cast a PyLong or PyInt to int type T. * @tparam T return type. @@ -109,6 +136,7 @@ T castInt(PyObject* obj, bool* ok = nullptr) { return (T)0; } } +#endif // PY_MAJOR_VERSION >= 3 /** * Invoke repr of python object. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bf45c11a9de53a109c72ff7a89b807bc80feb7c8..f21fd5cd6ec1a7c526694bfc3e7fc574cbbc365d 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -78,6 +78,12 @@ function cmake_gen() { PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" + elif [ "$1" == "cp35-cp35m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" fi fi @@ -108,6 +114,7 @@ function cmake_gen() { -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DPY_VERSION=${PY_VERSION:-2.7} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -136,7 +143,8 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} + -DWITH_INFERENCE_DEMO=${WITH_INFERENCE_DEMO:-ON} \ + -DPY_VERSION=${PY_VERSION:-2.7} } function abort(){ diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index d1cf04161ae4444ebc7da7fbc20e37dafe6c0fb1..241a07a35297e85763781a42696fd727733459a3 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -12,16 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. try: - from version import full_version as __version__ - from version import commit as __git_commit__ + from paddle.version import full_version as __version__ + from paddle.version import commit as __git_commit__ except ImportError: import sys - sys.stderr.write('''Warning with import paddle: you should not + sys.stderr.write('''Warning with import paddle: you should not import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) -import reader -import dataset -import batch +import paddle.reader +import paddle.dataset +import paddle.batch batch = batch.batch diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index 3315e826e82a33dfeb9c5223ce196cffb1ae7234..54aa3edc51d3734633ce077a59bd86cec8d09032 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -15,20 +15,20 @@ Dataset package. """ -import mnist -import imikolov -import imdb -import cifar -import movielens -import conll05 -import uci_housing -import sentiment -import wmt14 -import wmt16 -import mq2007 -import flowers -import voc2012 -import image +import paddle.dataset.mnist +import paddle.dataset.imikolov +import paddle.dataset.imdb +import paddle.dataset.cifar +import paddle.dataset.movielens +import paddle.dataset.conll05 +import paddle.dataset.uci_housing +import paddle.dataset.sentiment +import paddle.dataset.wmt14 +import paddle.dataset.wmt16 +import paddle.dataset.mq2007 +import paddle.dataset.flowers +import paddle.dataset.voc2012 +import paddle.dataset.image __all__ = [ 'mnist', diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 2a8e3d410add466436524d8cc7714fce955af2b5..d9acfef58c3ba92c763d195c88f1323b3c6512b9 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -324,10 +324,12 @@ def set_gradient_clip(clip, param_list=None, program=None): param.gradient_clip_attr = copy.deepcopy(clip) -def append_gradient_clip_ops(param_grad): +def append_gradient_clip_ops(param_grads): context = dict() - for p, g in param_grad: - with p.block.program.optimized_guard(p): + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) if clip_attr is None: clip_attr = NullGradientClipAttr() @@ -339,8 +341,10 @@ def append_gradient_clip_ops(param_grad): clip_attr._process_context(context=context, param=p, grad=g) res = [] - for p, g in param_grad: - with p.block.program.optimized_guard(p): + for p, g in param_grads: + if g is None: + continue + with p.block.program.optimized_guard([p, g]): res.append(clip_attr._create_operators(param=p, grad=g)) return res diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index ea3117e02bd993b06de39725b2c3296031065e3c..d89cb246a939f247b94bc49f39198a909b1c30ea 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1319,7 +1319,7 @@ class Program(object): self._op_role_var = [var_name] @contextlib.contextmanager - def optimized_guard(self, var): + def optimized_guard(self, param_and_grads): """ A with guard to set :code:`Optimization` :code:`OpRole` and :code:`OpRoleVar` automatically. @@ -1327,17 +1327,20 @@ class Program(object): Notes: This is a very low level API. Users should not use it directly. Args: - var(Variable|str): The variable (name) to be optimized. + param_and_grads(list): The variables (names) to be optimized. Examples: >>> p, g = backward(...) - >>> with program.optimized_guard(p): + >>> with program.optimized_guard([p,g]): >>> p = p - 0.001 * g """ OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize - self._op_role_var = [var.name if isinstance(var, Variable) else var] + self._op_role_var = [ + var.name if isinstance(var, Variable) else var + for var in param_and_grads + ] yield self._op_role_var = [] self._current_role = OpRole.Forward diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 214f47afa1e4e29d53b11ccc035b4d840f963591..94e78d155f1c9aa5b7abda0e83db528ad5e2aafb 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -123,7 +123,7 @@ class Optimizer(object): """ pass - def _finish_update(self, block, parameters): + def _finish_update(self, block, parameters_and_grads): """Finish any custom updates needed before completing an optimization step @@ -226,18 +226,18 @@ class Optimizer(object): optimize_ops = [] for param_and_grad in parameters_and_grads: + if param_and_grad[1] is None: + continue with param_and_grad[0].block.program.optimized_guard( - param_and_grad[0]): - if param_and_grad[0].trainable is True and param_and_grad[ - 1] is not None: + param_and_grad): + if param_and_grad[0].trainable is True: optimize_op = self._append_optimize_op(loss.block, param_and_grad) optimize_ops.append(optimize_op) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies - self._finish_update(loss.block, - [p[0] for p in parameters_and_grads]) + self._finish_update(loss.block, parameters_and_grads) end = len(global_block.ops) return global_block.slice_ops(start, end) @@ -564,13 +564,15 @@ class AdamOptimizer(Optimizer): return adam_op - def _finish_update(self, block, parameters): + def _finish_update(self, block, param_and_grads): """Update Beta1 and Beta2 Power accumulators """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param in parameters: - with param.block.program.optimized_guard(param): + for param, grad in param_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, @@ -691,13 +693,15 @@ class AdamaxOptimizer(Optimizer): return adamax_op - def _finish_update(self, block, parameters): + def _finish_update(self, block, parameters_and_grads): """Update Beta1 Power accumulator """ assert isinstance(block, framework.Block) main_block = block.program.global_block() - for param in parameters: - with param.block.program.optimized_guard(param): + for param, grad in parameters_and_grads: + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) main_block.append_op( @@ -1158,7 +1162,9 @@ class ModelAverage(Optimizer): self.params_grads.append((param, grad)) for param, grad in self.params_grads: - with param.block.program.optimized_guard(param): + if grad is None: + continue + with param.block.program.optimized_guard([param, grad]): self._append_average_accumulate_op(param) self.apply_program = Program() diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 53f35f5cc062b4da431be19e4484f316bb37be9f..080c185420bdc79d6da1d5a52fdd11fa4105d59a 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -41,12 +41,11 @@ def append_regularization_ops(parameters_and_grads, regularization=None): """ params_and_grads = [] for param, grad in parameters_and_grads: - with param.block.program.optimized_guard(param): - # If no gradient then we don't need to do anything - if grad is None: - params_and_grads.append((param, grad)) - continue - + # If no gradient then we don't need to do anything + if grad is None: + params_and_grads.append((param, grad)) + continue + with param.block.program.optimized_guard([param, grad]): regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index cddf00765f4894126988c794763c34629449e8e6..f5c93319de02249a22981b50733da05bb8658e3a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -35,7 +35,7 @@ class TestParallelExecutorBase(unittest.TestCase): feed_dict=None, seed=None, use_parallel_executor=True, - balance_parameter_opt_between_cards=False): + use_reduce=False): def run_executor(exe, feed, fetch_list, program=None): if isinstance(exe, fluid.ParallelExecutor): res = exe.run(fetch_list=fetch_list, feed=feed) @@ -50,14 +50,19 @@ class TestParallelExecutorBase(unittest.TestCase): main = fluid.Program() startup = fluid.Program() startup.random_seed = 1 # Fix random seed + main.random_seed = 1 with fluid.program_guard(main, startup): if seed is not None: startup.random_seed = seed + main.random_seed = seed + loss = method(use_feed=feed_dict is not None) adam = fluid.optimizer.Adam() adam.minimize(loss) + if memory_opt: fluid.memory_optimize(main) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) @@ -65,7 +70,8 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ + if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_parallel_executor: exe = fluid.ParallelExecutor( diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3ab477a3e0f3255165c0873af7d84f1926877f54..ccd1b1c122d53fc1d8a2c4d7869e4a2164260906 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -97,9 +97,7 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def check_simple_fc_convergence(self, use_cuda, use_reduce=False): self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) @@ -111,20 +109,19 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_simple_fc(self): - self.check_simple_fc_convergence(False, use_cuda=True) - self.check_simple_fc_convergence(False, use_cuda=False) + # use_cuda + self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True, use_cuda=True) - self.check_simple_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self.check_simple_fc_convergence(True, True) + self.check_simple_fc_convergence(False, True) - def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards, - use_cuda=True): + def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -141,8 +138,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_parallel_executor=True, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) for p_f in parallel_first_loss: self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) @@ -150,15 +146,15 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False, use_cuda=True) - self.check_simple_fc_parallel_accuracy(False, use_cuda=False) + self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(False) def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True, use_cuda=True) - self.check_simple_fc_parallel_accuracy(True, use_cuda=False) + # use_cuda, use_reduce + self.check_simple_fc_parallel_accuracy(True, True) + self.check_simple_fc_parallel_accuracy(False, True) - def check_batchnorm_fc_convergence( - self, balance_parameter_opt_between_cards, use_cuda): + def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False): self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') @@ -167,16 +163,16 @@ class TestMNIST(TestParallelExecutorBase): feed_dict={"image": img, "label": label}, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False, use_cuda=True) - self.check_batchnorm_fc_convergence(False, use_cuda=False) + self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True, use_cuda=True) - self.check_batchnorm_fc_convergence(True, use_cuda=False) + # use_cuda, use_reduce + self.check_batchnorm_fc_convergence(True, True) + self.check_batchnorm_fc_convergence(False, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 066299e6c6f7f6c159cb0886e86d3404b027b698..57ae36dbdd401afd34d06f460ae613db18240a2e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -131,10 +131,7 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, - balance_parameter_opt_between_cards, - use_cuda=True, - iter=20): + def check_resnet_convergence(self, use_cuda, use_reduce=False, iter=20): os.environ['CPU_NUM'] = str(4) import functools @@ -145,16 +142,16 @@ class TestResnet(TestParallelExecutorBase): iter=iter, batch_size=batch_size, use_cuda=use_cuda, - balance_parameter_opt_between_cards=balance_parameter_opt_between_cards - ) + use_reduce=use_reduce) def test_resnet(self): - self.check_resnet_convergence(False, use_cuda=True) - self.check_resnet_convergence(False, use_cuda=False, iter=5) + self.check_resnet_convergence(True) + self.check_resnet_convergence(False, iter=5) def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True, use_cuda=True) - self.check_resnet_convergence(True, use_cuda=False, iter=5) + # use_cuda, use_reduce + self.check_resnet_convergence(True, True) + self.check_resnet_convergence(False, True, iter=5) if __name__ == '__main__': diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 3b059735a924d58714cd88a761eb83143f1192d6..678026cf95970e8ff58c1bad20246059ffb464c1 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -66,9 +66,9 @@ An example implementation for multiple item data reader creator: TODO(yuyang18): Should we add whole design doc here? """ -import decorator -from decorator import * +import paddle.reader.decorator +from paddle.reader.decorator import * -import creator +import paddle.reader.creator __all__ = decorator.__all__ + ['creator'] diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 1f83cabb8481451736944823be45185deea4f43b..4b1fe94222d35f8c0e4e4cccc364227a3f9509d0 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -20,7 +20,7 @@ __all__ = [ from threading import Thread import subprocess -from Queue import Queue +from six.moves.queue import Queue import itertools import random import zlib diff --git a/python/requirements.txt b/python/requirements.txt index ea827e9d5a0dcf8eb2ede1f6eaa88c777a138816..c091ecb111bda9d5e83c3ddcae93aed0745f9e4c 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -8,4 +8,4 @@ scipy>=0.19.0 Pillow nltk>=3.2.2 graphviz -LinkChecker +six diff --git a/python/setup.py.in b/python/setup.py.in index a064f36cc19dbc626dd85d76290280a01fa57215..a81cd19e10153be0d07badfa0c0fbcb01fe568f7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -17,7 +17,8 @@ def git_commit(): git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() except: git_commit = 'Unknown' - return git_commit + git_commit = git_commit.decode() + return str(git_commit) def _get_version_detail(idx): assert idx < 3, "vesion info consists of %(major)d.%(minor)d.%(patch)d, \ @@ -44,6 +45,7 @@ def is_taged(): try: cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + git_tag = git_tag.decode() except: return False @@ -67,13 +69,13 @@ with_mkl = '%(with_mkl)s' def show(): if istaged: - print 'full_version:', full_version - print 'major:', major - print 'minor:', minor - print 'patch:', patch - print 'rc:', rc + print('full_version:', full_version) + print('major:', major) + print('minor:', minor) + print('patch:', patch) + print('rc:', rc) else: - print 'commit:', commit + print('commit:', commit) def mkl(): return with_mkl