提交 c4535732 编写于 作者: T tensor-tang

Merge remote-tracking branch 'ups/develop' into fix

......@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20")
......
......@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#ifdef PADDLE_WITH_CUDA
std::vector<void *> buffers;
size_t numel = main_tensor.numel();
ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto place = member_->places_[i];
void *buffer;
......@@ -159,11 +159,21 @@ void ParallelExecutor::BCastParamsToGPUs(
t->Resize(dims);
buffer = t->mutable_data(place, main_tensor.type());
}
auto &nccl_ctx = member_->nccl_ctxs_->at(place);
platform::dynload::ncclBcast(buffer, numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
buffers.push_back(buffer);
}
member_->nccl_ctxs_->WaitAll();
PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
"variables' buffer size to bcast NOT equal to places");
{
platform::NCCLGroupGuard guard;
for (size_t i = 0; i < member_->places_.size(); ++i) {
auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
nccl_ctx.comm_, nccl_ctx.stream());
}
member_->nccl_ctxs_->WaitAll();
}
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
......
......@@ -20,16 +20,20 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/pybind/pybind.h"
DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
DEFINE_bool(init_p2p, false, "Whether to init p2p.");
DEFINE_int32(math_num_threads, 1,
"Number of threads used to run math functions.");
namespace paddle {
namespace inference {
void Init(const std::vector<std::string> argv) {
framework::InitGflags(argv);
operators::math::SetNumThreads(FLAGS_math_num_threads);
// init devices
std::vector<int> devices;
std::string token;
......
......@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
auto scope = request_->GetMutableLocalScope();
auto invar = scope->FindVar(in_var_name);
framework::Variable* outvar = scope->FindVar(out_var_name);
// out var must be created in local scope!
framework::Variable* outvar = scope->Var(out_var_name);
request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
......
......@@ -20,13 +20,16 @@
#ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h>
#include <mkl_lapacke.h>
#include <mkl_service.h>
#include <mkl_vml_functions.h>
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h>
#endif
#endif
#ifndef LAPACK_FOUND
extern "C" {
......@@ -46,6 +49,18 @@ namespace paddle {
namespace operators {
namespace math {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_USE_OPENBLAS
int real_num_threads = num_threads > 1 ? num_threads : 1;
openblas_set_num_threads(real_num_threads);
#elif defined(PADDLE_WITH_MKLML)
int real_num_threads = num_threads > 1 ? num_threads : 1;
mkl_set_num_threads(real_num_threads);
#else
PADDLE_ENFORCE(false, "To be implemented.");
#endif
}
/**
* Matrix Descriptor of a memory buffer.
*
......
......@@ -21,8 +21,10 @@ limitations under the License. */
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h>
#endif
#endif
#ifndef LAPACK_FOUND
extern "C" {
......
......@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
}
}
// NOTE(minqiyang): according to the ncclGroupEnd documentations:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause blocking problem when a runtime_error was thrown, so try only guard
// NCCL actions when use it.
class NCCLGroupGuard {
public:
static std::mutex &NCCLMutex() {
......
......@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef MATHFUNCTIONS_H_
#define MATHFUNCTIONS_H_
#pragma once
#ifdef PADDLE_WITH_MKLML
#include <mkl_cblas.h>
......@@ -21,7 +20,7 @@ limitations under the License. */
#include <mkl_vml_functions.h>
#endif
#if defined(PADDLE_USE_VECLIB)
#ifdef PADDLE_USE_VECLIB
extern "C" {
#include <cblas.h>
#include <clapack.h>
......@@ -30,8 +29,10 @@ extern "C" {
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#ifdef LAPACK_FOUND
#include <lapacke.h>
#endif
#endif
#ifndef LAPACK_FOUND
extern "C" {
......@@ -126,5 +127,3 @@ template <class T>
void vTanh(const int n, const T* a, T* r);
} // namespace paddle
#endif // MATHFUNCTIONS_H_
......@@ -4242,7 +4242,7 @@ def random_crop(x, shape, seed=None):
seed_out = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="random_crop",
inputs={"X": input,
inputs={"X": x,
"Seed": seed},
outputs={"Out": out,
"SeedOut": seed_out},
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册