Commit c4535732 authored by tensor-tang

Merge remote-tracking branch 'ups/develop' into fix

@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
     "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
     CACHE FILEPATH "openblas library." FORCE)
+  ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
   SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
   SET(OPENBLAS_COMMIT "v0.2.20")
......
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto place = member_->places_[i];
         void *buffer;
@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
           t->Resize(dims);
           buffer = t->mutable_data(place, main_tensor.type());
         }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
       }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
 #else
       PADDLE_THROW("Not compiled with CUDA");
 #endif
......
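For context on the hunk above: instead of issuing each ncclBcast under a loop-scoped NCCLGroupGuard, the new code first collects every destination buffer and then issues all broadcasts inside a single guarded group before waiting. A minimal standalone sketch of that grouped-broadcast pattern with raw NCCL/CUDA is shown below; the device discovery, buffer sizes, and names are illustrative assumptions, not code from this commit.

// grouped_bcast_sketch.cu -- illustrative only; compile with nvcc and link -lnccl
#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

int main() {
  int ngpus = 0;
  cudaGetDeviceCount(&ngpus);

  // One communicator, stream, and destination buffer per device.
  std::vector<int> devs(ngpus);
  for (int i = 0; i < ngpus; ++i) devs[i] = i;
  std::vector<ncclComm_t> comms(ngpus);
  ncclCommInitAll(comms.data(), ngpus, devs.data());

  const size_t numel = 1024;
  std::vector<cudaStream_t> streams(ngpus);
  std::vector<float *> buffers(ngpus);
  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(i);
    cudaStreamCreate(&streams[i]);
    cudaMalloc(&buffers[i], numel * sizeof(float));
  }

  // Collect buffers first, then fuse all broadcasts into one NCCL group so no
  // single call blocks the others (the part NCCLGroupGuard wraps in the hunk above).
  ncclGroupStart();
  for (int i = 0; i < ngpus; ++i) {
    ncclBcast(buffers[i], numel, ncclFloat, /*root=*/0, comms[i], streams[i]);
  }
  ncclGroupEnd();

  // Rough analogue of member_->nccl_ctxs_->WaitAll().
  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(i);
    cudaStreamSynchronize(streams[i]);
  }

  for (int i = 0; i < ngpus; ++i) {
    cudaSetDevice(i);
    cudaFree(buffers[i]);
    cudaStreamDestroy(streams[i]);
    ncclCommDestroy(comms[i]);
  }
  return 0;
}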
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"

 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");

 namespace paddle {
 namespace inference {

 void Init(const std::vector<std::string> argv) {
   framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
   // init devices
   std::vector<int> devices;
   std::string token;
......
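The new math_num_threads flag is read once during inference::Init and forwarded to operators::math::SetNumThreads. A standalone sketch of that flag-to-setter flow using plain gflags follows; the stub setter and file name are hypothetical stand-ins for the Paddle internals.

// math_threads_flag_sketch.cc -- illustrative only; link against gflags
#include <gflags/gflags.h>
#include <cstdio>

DEFINE_int32(math_num_threads, 1,
             "Number of threads used to run math functions.");

// Stand-in for operators::math::SetNumThreads; the real helper forwards to
// openblas_set_num_threads or mkl_set_num_threads depending on build flags.
static void SetNumThreadsStub(int num_threads) {
  int real_num_threads = num_threads > 1 ? num_threads : 1;
  std::printf("math threads capped at %d\n", real_num_threads);
}

int main(int argc, char *argv[]) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  SetNumThreadsStub(FLAGS_math_num_threads);  // e.g. ./a.out --math_num_threads=4
  return 0;
}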
@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
     auto scope = request_->GetMutableLocalScope();
     auto invar = scope->FindVar(in_var_name);
-    framework::Variable* outvar = scope->FindVar(out_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);

     request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
......
@@ -20,13 +20,16 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_service.h>
 #include <mkl_vml_functions.h>
 #endif

 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {
@@ -46,6 +49,18 @@ namespace paddle {
 namespace operators {
 namespace math {

+static void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  openblas_set_num_threads(real_num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  mkl_set_num_threads(real_num_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
 /**
  * Matrix Descriptor of a memory buffer.
  *
......
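SetNumThreads above simply forwards the requested count to the BLAS backend's own setter (openblas_set_num_threads for OpenBLAS, mkl_set_num_threads from the newly included <mkl_service.h> for MKL). A minimal sketch of the effect on an OpenBLAS build follows; the matrix sizes are arbitrary and the snippet is an illustration, not Paddle code.

// blas_thread_cap_sketch.cc -- illustrative only; link against OpenBLAS (-lopenblas)
#include <cblas.h>
#include <vector>

int main() {
  const int n = 512;
  std::vector<float> a(n * n, 1.0f), b(n * n, 1.0f), c(n * n, 0.0f);

  // What SetNumThreads(1) resolves to on an OpenBLAS build.
  openblas_set_num_threads(1);

  // Subsequent BLAS calls now run single-threaded.
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n, n, n,
              1.0f, a.data(), n, b.data(), n, 0.0f, c.data(), n);
  return 0;
}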
@@ -21,8 +21,10 @@ limitations under the License. */
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {
......
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
   }
 }

+// NOTE(minqiyang): according to the ncclGroupEnd documentation
+// (https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html),
+// ncclGroupEnd will wait for all communicators to be initialized, which can
+// block when a runtime_error is thrown, so only guard NCCL actions when
+// using it.
 class NCCLGroupGuard {
  public:
   static std::mutex &NCCLMutex() {
......
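The guard's body is not part of this hunk; assuming NCCLGroupGuard brackets the enclosed NCCL calls with ncclGroupStart/ncclGroupEnd while holding a global mutex, a minimal RAII analogue would look like the sketch below (an assumption-based illustration, not the actual Paddle class).

// nccl_group_guard_sketch.h -- illustrative RAII analogue, not Paddle's class
#include <mutex>
#include <nccl.h>

class GroupGuardSketch {
 public:
  static std::mutex &Mutex() {
    static std::mutex m;
    return m;
  }
  GroupGuardSketch() {
    Mutex().lock();
    ncclGroupStart();  // begin fusing subsequent NCCL calls
  }
  ~GroupGuardSketch() {
    // ncclGroupEnd waits for all communicators to be initialized (see the NOTE
    // above), hence the advice to scope the guard around NCCL calls only.
    ncclGroupEnd();
    Mutex().unlock();
  }
};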
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef MATHFUNCTIONS_H_
-#define MATHFUNCTIONS_H_
+#pragma once

 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
@@ -21,7 +20,7 @@ limitations under the License. */
 #include <mkl_vml_functions.h>
 #endif

-#if defined(PADDLE_USE_VECLIB)
+#ifdef PADDLE_USE_VECLIB
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
@@ -30,8 +29,10 @@ extern "C" {
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif

 #ifndef LAPACK_FOUND
 extern "C" {
@@ -126,5 +127,3 @@ template <class T>
 void vTanh(const int n, const T* a, T* r);

 }  // namespace paddle
-
-#endif  // MATHFUNCTIONS_H_
@@ -4242,7 +4242,7 @@ def random_crop(x, shape, seed=None):
     seed_out = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="random_crop",
-        inputs={"X": input,
+        inputs={"X": x,
                 "Seed": seed},
         outputs={"Out": out,
                  "SeedOut": seed_out},
......