diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 8af2765f58717408e3a1ef6b500bb01511bfd8d3..4a49a92f2b131bbb38fcf93070ea811e0b1a14e8 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
+    ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
     SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
     SET(OPENBLAS_COMMIT "v0.2.20")
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index ac4d1f58a5b3b11f034af7618681ebd913d8afb9..9406c6155da860c90739bddac1e81403b094e619 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto place = member_->places_[i];
         void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
           t->Resize(dims);
           buffer = t->mutable_data(place, main_tensor.type());
         }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
       }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
 #else
       PADDLE_THROW("Not compiled with CUDA");
 #endif
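Note on the parallel_executor.cc change above: the broadcast is split into two phases so that nothing can throw while an NCCL group is open. All destination buffers are allocated first; only then are the ncclBcast calls issued under a single NCCLGroupGuard and awaited. A minimal sketch of the same pattern in plain NCCL, with illustrative names rather than Paddle's wrappers:

    #include <cuda_runtime.h>
    #include <nccl.h>
    #include <vector>

    // Broadcast device 0's buffer contents to all other devices.
    void BcastFromDev0(size_t numel, int ndev) {
      std::vector<ncclComm_t> comms(ndev);
      ncclCommInitAll(comms.data(), ndev, nullptr);  // use devices 0..ndev-1

      // Phase 1: allocate every destination buffer before the group opens,
      // so an allocation failure cannot leave ncclGroupStart() unmatched.
      std::vector<float *> buffers(ndev);
      std::vector<cudaStream_t> streams(ndev);
      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(i);
        cudaMalloc(&buffers[i], numel * sizeof(float));
        cudaStreamCreate(&streams[i]);
      }

      // Phase 2: batch all broadcasts into one NCCL group (root = device 0),
      // then wait on each stream, mirroring nccl_ctxs_->WaitAll().
      ncclGroupStart();
      for (int i = 0; i < ndev; ++i) {
        ncclBcast(buffers[i], numel, ncclFloat, 0, comms[i], streams[i]);
      }
      ncclGroupEnd();
      for (int i = 0; i < ndev; ++i) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
      }
    }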
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index 65db7c7b5008dcb301e741ec17c3623715e10bb8..6b03ac7119b117e442e6af34c719c8a4f736bde9 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"
 
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");
 
 namespace paddle {
 namespace inference {
 
 void Init(const std::vector<std::string> argv) {
   framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
   // init devices
   std::vector<int> devices;
   std::string token;
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index 2d34f85838c34f1dfe43d2130e127d0258072fa7..5a87258901c6563fe793d4041f344011a56d9a01 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
 
     auto scope = request_->GetMutableLocalScope();
     auto invar = scope->FindVar(in_var_name);
-    framework::Variable* outvar = scope->FindVar(out_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);
 
     request_handler_->Handle(in_var_name, scope, invar, &outvar,
                              out_var_name);
diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h
index 1a37cb39d56066b8380338b9710a441e41518c39..6207d14ecdc922cbca2d05d20e4b8a9da9b9d627 100644
--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -20,13 +20,16 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_service.h>
 #include <mkl_vml_functions.h>
 #endif
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
@@ -46,6 +49,18 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+static void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  openblas_set_num_threads(real_num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  mkl_set_num_threads(real_num_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
 /**
  * Matrix Descriptor of a memory buffer.
  *
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
index d4b0e17ed44da61e2633b9bd97faeb62f9967c3c..8b296b6a07ca222ddc08fedfd2eed423b46dc5c3 100644
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
@@ -21,8 +21,10 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
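Note on the blas.h change above: SetNumThreads clamps the request to at least one thread and dispatches to openblas_set_num_threads or mkl_set_num_threads depending on which backend was compiled in, failing loudly when neither is available. A hypothetical driver showing how the new --math_num_threads flag reaches it through Init (the program-loading part is elided):

    #include <string>
    #include <vector>

    #include "paddle/fluid/inference/io.h"

    int main(int argc, char **argv) {
      // Init parses the gflags and then calls
      // operators::math::SetNumThreads(FLAGS_math_num_threads).
      std::vector<std::string> flags(argv, argv + argc);
      flags.push_back("--math_num_threads=4");  // overrides the default of 1
      paddle::inference::Init(flags);
      // ... load and run the inference program ...
      return 0;
    }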
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 6f8e3f22db54d166cf97cfdd3d009058207a7ca5..cc46c88fd1f9a5d1bacad26beed6fd0af6405310 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
   }
 }
 
+// NOTE(minqiyang): according to the ncclGroupEnd documentation
+// (https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html),
+// ncclGroupEnd waits for all communicators to be initialized, which can
+// block when a runtime_error is thrown, so guard only the actual NCCL
+// calls with it.
 class NCCLGroupGuard {
  public:
   static std::mutex &NCCLMutex() {
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index f3d8b1a39e849d5f5a9e79cf33252b60170ced81..854e4baa3987f61353038c7b26acf43943c89636 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef MATHFUNCTIONS_H_
-#define MATHFUNCTIONS_H_
+#pragma once
 
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
@@ -21,7 +20,7 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif
 
-#if defined(PADDLE_USE_VECLIB)
+#ifdef PADDLE_USE_VECLIB
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
 }
@@ -30,8 +29,10 @@ extern "C" {
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
@@ -126,5 +127,3 @@ template <class T>
 void vTanh(const int n, const T* a, T* r);
 
 }  // namespace paddle
-
-#endif  // MATHFUNCTIONS_H_
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 982340d7eeb872597f61b5ca351da019c65dd72f..2c1f9888282188b8066924cef0108ee697f91da6 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -4242,7 +4242,7 @@ def random_crop(x, shape, seed=None):
     seed_out = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="random_crop",
-        inputs={"X": input,
+        inputs={"X": x,
                 "Seed": seed},
         outputs={"Out": out,
                  "SeedOut": seed_out},
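Closing note on the nccl_helper.h comment: because ncclGroupEnd blocks until every communicator in the group is initialized, holding the guard across work that may throw (such as the buffer allocation moved out of the guarded region in parallel_executor.cc) risks blocking or leaving a group open. The guard itself is an ordinary RAII bracket, roughly as below; this is a simplified sketch, and the real class also serializes groups with a static mutex:

    #include <nccl.h>

    // Open an NCCL group on construction and close it on destruction, so
    // every collective issued while the guard is alive launches as one
    // batch. Locking and error handling are elided in this sketch.
    class NCCLGroupGuardSketch {
     public:
      NCCLGroupGuardSketch() { ncclGroupStart(); }
      ~NCCLGroupGuardSketch() { ncclGroupEnd(); }  // waits for comm init

      NCCLGroupGuardSketch(const NCCLGroupGuardSketch &) = delete;
      NCCLGroupGuardSketch &operator=(const NCCLGroupGuardSketch &) = delete;
    };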