Unverified commit 75d15719, authored by Tao Luo and committed by GitHub

refine PADDLE_ENFORCE code to unify PADDLE_ASSERT_MSG (#19603)

test=develop
Parent 1c2aae56
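The hunks below all apply the same small set of mechanical substitutions. A minimal before/after sketch of those patterns, assuming Paddle's enforce macros (PADDLE_ENFORCE_EQ, PADDLE_ENFORCE_GE/GT/LT, PADDLE_ENFORCE_NOT_NULL, PADDLE_ENFORCE_CUDA_SUCCESS) are available from the usual platform headers; it is illustrative only and not meant to compile on its own:

// Boolean conditions: compare explicitly against true.
//   old: PADDLE_ENFORCE(platform::is_gpu_place(place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(place), true);

// Relational conditions: use the dedicated comparison macros.
//   old: PADDLE_ENFORCE(idx < sparsity.size());
PADDLE_ENFORCE_LT(idx, sparsity.size());
//   old: PADDLE_ENFORCE(static_cast<int>(cur_step) >= 0);
PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);

// Null-pointer checks: use PADDLE_ENFORCE_NOT_NULL.
//   old: PADDLE_ENFORCE(ptr != nullptr, msg);
PADDLE_ENFORCE_NOT_NULL(ptr, msg);

// CUDA / cuDNN / cuBLAS / NCCL status codes: wrap in PADDLE_ENFORCE_CUDA_SUCCESS.
//   old: PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, n * sizeof(T), stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(y_data, 0, n * sizeof(T), stream));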
@@ -66,12 +66,12 @@ class Node {
std::string Name() const { return name_; }
VarDesc* Var() const {
PADDLE_ENFORCE(IsVar());
PADDLE_ENFORCE_EQ(IsVar(), true);
return var_desc_.get();
}
OpDesc* Op() const {
PADDLE_ENFORCE(IsOp());
PADDLE_ENFORCE_EQ(IsOp(), true);
return op_desc_.get();
}
@@ -53,7 +53,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
auto stream =
@@ -64,7 +64,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
auto stream =
@@ -75,7 +75,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
auto ctx_place = ctx.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
if (platform::is_same_place(src_place, dst_place)) {
@@ -146,7 +146,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
dst->resize(src.numel());
auto dst_ptr = static_cast<void*>(dst->data());
PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true);
memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
src_ptr, size);
@@ -25,7 +25,7 @@ namespace detail {
*/
template <typename T, typename... ARGS>
inline T& Ref(T* ptr, ARGS&&... args) {
PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...));
PADDLE_ENFORCE_NOT_NULL(ptr, ::paddle::string::Sprintf(args...));
return *ptr;
}
@@ -23,14 +23,14 @@ namespace operators {
inline float get_period_sparcity(const std::vector<float>& sparsity,
float cur_step, float rampup_steps) {
PADDLE_ENFORCE(static_cast<int>(cur_step) >= 0);
PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
if (idx >= sparsity.size()) {
return 0.999;
}
PADDLE_ENFORCE(idx < sparsity.size());
PADDLE_ENFORCE_LT(idx, sparsity.size());
return sparsity[idx];
}
@@ -63,7 +63,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
float ratio =
1 - get_period_sparcity(sparsity, static_cast<float>(*current_step),
rampup_step);
PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0);
PADDLE_ENFORCE_GE(ratio, 0.0);
PADDLE_ENFORCE_LT(ratio, 1.0);
int k = static_cast<int>(g->numel() * ratio);
VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
@@ -86,9 +86,10 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
auto* x_data = x->data<T>();
auto* y_data = y->mutable_data<T>(context.GetPlace());
if (dropout_prob == 1.0f) {
PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
PADDLE_ENFORCE(cudaMemsetAsync(mask_data, 0,
x_numel * sizeof(*mask_data), stream));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(
mask_data, 0, x_numel * sizeof(*mask_data), stream));
return;
}
@@ -66,8 +66,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
const framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
PADDLE_ENFORCE_EQ(im.dims().size(), 3);
PADDLE_ENFORCE_EQ(col->dims().size(), 5);
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
@@ -152,8 +152,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
PADDLE_ENFORCE_EQ(im->dims().size(), 3);
PADDLE_ENFORCE_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
@@ -249,8 +249,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
const framework::Tensor& im, const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* col) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col->dims().size() == 5);
PADDLE_ENFORCE_EQ(im.dims().size(), 3);
PADDLE_ENFORCE_EQ(col->dims().size(), 5);
int im_channels = im.dims()[0];
int im_height = im.dims()[1];
int im_width = im.dims()[2];
@@ -331,8 +331,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
const std::vector<int>& dilation,
const std::vector<int>& stride,
const std::vector<int>& padding, framework::Tensor* im) {
PADDLE_ENFORCE(im->dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
PADDLE_ENFORCE_EQ(im->dims().size(), 3);
PADDLE_ENFORCE_EQ(col.dims().size(), 5);
int im_channels = im->dims()[0];
int im_height = im->dims()[1];
int im_width = im->dims()[2];
@@ -142,9 +142,9 @@ void GPUSampleWithProb<T>::operator()(
int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
VLOG(1) << "num_tries: " << num_tries;
PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data,
sizeof(int64_t) * num_samples,
cudaMemcpyHostToDevice));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data,
sizeof(int64_t) * num_samples,
cudaMemcpyHostToDevice));
int threads = 512;
const size_t size = batch_size * num_sampled_classes;
@@ -55,11 +55,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto* in1_data = in1_value.data<T>();
auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
auto in2_place = input2.place();
PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true);
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data,
@@ -162,9 +162,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
}
auto in1_place = input1.place();
PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
auto in2_place = input2->place();
PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->data<T>();
@@ -78,8 +78,8 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* col) const {
PADDLE_ENFORCE(vol.dims().size() == 4);
PADDLE_ENFORCE(col->dims().size() == 7);
PADDLE_ENFORCE_EQ(vol.dims().size(), 4);
PADDLE_ENFORCE_EQ(col->dims().size(), 7);
int input_channels = vol.dims()[0];
int input_depth = vol.dims()[1];
@@ -204,8 +204,8 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* vol) const {
PADDLE_ENFORCE(vol->dims().size() == 4);
PADDLE_ENFORCE(col.dims().size() == 7);
PADDLE_ENFORCE_EQ(vol->dims().size(), 4);
PADDLE_ENFORCE_EQ(col.dims().size(), 7);
int input_channels = vol->dims()[0];
int input_depth = vol->dims()[1];
@@ -30,7 +30,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
auto* grad_var = ctx.InputVar("Grad");
// only support dense for now.
PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>());
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true);
auto grad = ctx.Input<framework::LoDTensor>("Grad");
param_out->mutable_data<T>(ctx.GetPlace());
@@ -49,11 +49,12 @@ static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
const framework::Tensor& array,
const framework::Tensor& index,
framework::Tensor* value) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 &&
index.dims()[0] == array.dims()[0] &&
index.dims() == value->dims());
PADDLE_ENFORCE_EQ(index.dims().size(), 2);
PADDLE_ENFORCE_EQ(array.dims().size(), 2);
PADDLE_ENFORCE_EQ(index.dims()[0], array.dims()[0]);
PADDLE_ENFORCE_EQ(index.dims(), value->dims());
const auto batch_size = index.dims()[0];
const auto num_take = index.dims()[1];
@@ -88,11 +89,12 @@ static void CPUPutAlongD1(const platform::DeviceContext& ctx,
framework::Tensor* array,
const framework::Tensor& index,
const framework::Tensor& value) {
PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
// UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 &&
index.dims()[0] == array->dims()[0] &&
index.dims() == value.dims());
PADDLE_ENFORCE_EQ(index.dims().size(), 2);
PADDLE_ENFORCE_EQ(array->dims().size(), 2);
PADDLE_ENFORCE_EQ(index.dims()[0], array->dims()[0]);
PADDLE_ENFORCE_EQ(index.dims(), value.dims());
const auto batch_size = index.dims()[0];
const auto num_put = index.dims()[1];
auto array_dims = array->dims();
@@ -147,8 +149,8 @@ class SampleLogitsKernel : public framework::OpKernel<T> {
public:
using Tensor = framework::Tensor;
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
"This kernel only runs on CPU.");
PADDLE_ENFORCE_EQ(platform::is_cpu_place(context.GetPlace()), true,
"This kernel only runs on CPU.");
VLOG(3) << "Enter SampleLogitsKernel";
// get necessary inputs
const Tensor* logits = context.Input<Tensor>("Logits");
@@ -92,8 +92,8 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
bool in_place = out_var == in_vars[0];
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
"Only support all inputs are TensorArray");
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(), true,
"Only support all inputs are TensorArray");
auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
for (size_t i = 0; i < in_array.size(); ++i) {
@@ -106,7 +106,7 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
context.device_context(), &out_array[i]);
out_array[i].set_lod(in_array[i].lod());
} else {
PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
PADDLE_ENFORCE_EQ(out_array[i].lod(), in_array[i].lod());
auto in = EigenVector<T>::Flatten(in_array[i]);
auto result = EigenVector<T>::Flatten(out_array[i]);
result.device(*context.template device_context<DeviceContext>()
@@ -178,7 +178,7 @@ class SyncBatchNormKernel : public framework::OpKernel<T> {
int dtype = platform::ToNCCLDataType(x->type());
// In-place operation
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
@@ -398,7 +398,7 @@ class SyncBatchNormGradKernel : public framework::OpKernel<T> {
}
int dtype = platform::ToNCCLDataType(x->type());
// In-place operation
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
comm, stream));
@@ -29,17 +29,19 @@ namespace platform {
class CublasHandleHolder {
public:
CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
PADDLE_ENFORCE(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
#if CUDA_VERSION >= 9000
if (math_type == CUBLAS_TENSOR_OP_MATH) {
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
}
#endif
}
~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); }
~CublasHandleHolder() {
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
}
template <typename Callback>
inline void Call(Callback &&callback) const {
@@ -221,10 +221,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
class ScopedTensorDescriptor {
public:
ScopedTensorDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_));
}
~ScopedTensorDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_));
}
inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format,
@@ -243,7 +243,7 @@ class ScopedTensorDescriptor {
if (groups > 1) {
dims_with_group[1] = dims_with_group[1] / groups;
}
PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
desc_, type, dims_with_group.size(), dims_with_group.data(),
strides.data()));
return desc_;
@@ -265,10 +265,10 @@ class ScopedTensorDescriptor {
class ScopedFilterDescriptor {
public:
ScopedFilterDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_));
}
~ScopedFilterDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_));
}
inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format,
@@ -284,7 +284,7 @@ class ScopedFilterDescriptor {
kernel_with_group[0] /= groups;
// NOTE: input filter(C) of the filter is already asserted to be C/groups.
}
PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor(
desc_, type, format, kernel_with_group.size(),
kernel_with_group.data()));
return desc_;
@@ -306,10 +306,12 @@ class ScopedFilterDescriptor {
class ScopedConvolutionDescriptor {
public:
ScopedConvolutionDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreateConvolutionDescriptor(&desc_));
}
~ScopedConvolutionDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroyConvolutionDescriptor(desc_));
}
inline cudnnConvolutionDescriptor_t descriptor(
@@ -332,7 +334,7 @@ class ScopedConvolutionDescriptor {
cudnnDataType_t compute_type =
(type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
CUDNN_CROSS_CORRELATION, compute_type));
return desc_;
@@ -353,10 +355,10 @@ class ScopedConvolutionDescriptor {
class ScopedPoolingDescriptor {
public:
ScopedPoolingDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_));
}
~ScopedPoolingDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_));
}
inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode,
@@ -365,7 +367,7 @@ class ScopedPoolingDescriptor {
const std::vector<int>& strides) {
PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor(
desc_, (GetPoolingMode(mode)),
CUDNN_PROPAGATE_NAN, // Always propagate nans.
kernel.size(), kernel.data(), pads.data(), strides.data()));
@@ -380,16 +382,18 @@ class ScopedPoolingDescriptor {
class ScopedSpatialTransformerDescriptor {
public:
ScopedSpatialTransformerDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
}
~ScopedSpatialTransformerDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
}
template <typename T>
inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims,
const int dimA[]) {
PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor(
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor(
desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
return desc_;
}
@@ -402,10 +406,12 @@ class ScopedSpatialTransformerDescriptor {
class ScopedActivationDescriptor {
public:
ScopedActivationDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnCreateActivationDescriptor(&desc_));
}
~ScopedActivationDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnDestroyActivationDescriptor(desc_));
}
template <typename T>
@@ -467,15 +473,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
class ScopedCTCLossDescriptor {
public:
ScopedCTCLossDescriptor() {
PADDLE_ENFORCE(dynload::cudnnCreateCTCLossDescriptor(&desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_));
}
~ScopedCTCLossDescriptor() {
PADDLE_ENFORCE(dynload::cudnnDestroyCTCLossDescriptor(desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_));
}
template <typename T>
inline cudnnCTCLossDescriptor_t descriptor() {
PADDLE_ENFORCE(
PADDLE_ENFORCE_CUDA_SUCCESS(
dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType<T>::type));
return desc_;
}
@@ -167,7 +167,7 @@ class CudnnHolder {
inline void ResetWorkspace() {
if (workspace_) {
// Maybe someone is using the current workspace
PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(*stream_));
workspace_ = nullptr;
}
}
@@ -306,7 +306,7 @@ class CUDADeviceContext : public DeviceContext {
template <typename Callback>
void RecordEvent(cudaEvent_t ev, Callback callback) {
callback();
PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
}
template <typename Callback>
@@ -63,11 +63,11 @@ class NCCLGroupGuard {
inline NCCLGroupGuard() {
NCCLMutex().lock();
PADDLE_ENFORCE(dynload::ncclGroupStart());
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart());
}
inline ~NCCLGroupGuard() {
PADDLE_ENFORCE(dynload::ncclGroupEnd());
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd());
NCCLMutex().unlock();
}
};
@@ -94,7 +94,7 @@ struct NCCLContextMap {
explicit NCCLContextMap(const std::vector<platform::Place> &places,
ncclUniqueId *nccl_id = nullptr,
size_t num_trainers = 1, size_t trainer_id = 0) {
PADDLE_ENFORCE(!places.empty());
PADDLE_ENFORCE_EQ(!places.empty(), true);
order_.reserve(places.size());
for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device;
@@ -109,7 +109,7 @@ struct NCCLContextMap {
// if num_trainers == 1, should create a new nccl id for local comms.
if (num_trainers == 1 && nccl_id == nullptr) {
std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
comms.get(), static_cast<int>(order_.size()), order_.data()));
} else {
PADDLE_ENFORCE_NOT_NULL(nccl_id);
@@ -126,8 +126,8 @@ struct NCCLContextMap {
}
VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
<< ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
comms.get() + i, nranks, *nccl_id, rank));
}
}
@@ -249,13 +249,13 @@ class NCCLCommunicator {
size_t trainers_num, size_t trainer_id,
size_t inter_trainers_num,
size_t exter_trainers_num) {
PADDLE_ENFORCE(trainers_num == inter_trainers_num * exter_trainers_num,
"trainers_num:%llu != inter_trainers_num:%llu * "
"exter_trainers_num:%llu",
trainers_num, inter_trainers_num, exter_trainers_num);
PADDLE_ENFORCE_EQ(trainers_num, inter_trainers_num * exter_trainers_num,
"trainers_num:%llu != inter_trainers_num:%llu * "
"exter_trainers_num:%llu",
trainers_num, inter_trainers_num, exter_trainers_num);
PADDLE_ENFORCE(inter_trainers_num > 1, "inter_trainers_num:%llu must > 1",
inter_trainers_num);
PADDLE_ENFORCE_GT(inter_trainers_num, 1, "inter_trainers_num:%llu must > 1",
inter_trainers_num);
int inter_trainer_id = trainer_id % inter_trainers_num;
for (size_t i = 0; i < inter_nccl_ids.size(); i++) {
@@ -35,13 +35,13 @@ void DummyKernelAndEvent() {
ForEachDevice([](int d) {
platform::SetDeviceId(d);
cudaStream_t stream;
PADDLE_ENFORCE(cudaStreamCreate(&stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int)));
DummyKernel<<<1, 1, 0, stream>>>(ptr);
PADDLE_ENFORCE(cudaStreamSynchronize(stream));
PADDLE_ENFORCE(cudaFree(ptr));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
});
}
}