Unverified commit 75d15719, authored by Tao Luo, committed by GitHub

refine PADDLE_ENFORCE code to unify PADDLE_ASSERT_MSG (#19603)

test=develop
Parent 1c2aae56
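The substitution in this change is mechanical: boolean-style PADDLE_ENFORCE(cond) calls become explicit comparison macros (PADDLE_ENFORCE_EQ/GE/GT/LT), pointer checks become PADDLE_ENFORCE_NOT_NULL, and CUDA/cuDNN/cuBLAS/NCCL status codes go through PADDLE_ENFORCE_CUDA_SUCCESS. Below is a minimal, self-contained sketch of the before/after shapes; the SKETCH_* macros are illustrative stand-ins only, not Paddle's real implementations (the real macros, defined in paddle/fluid/platform/enforce.h, also capture file/line and build richer messages), and sketch_cuda_call is a hypothetical placeholder for a CUDA runtime call.

// Illustrative stand-ins for the check shapes this PR standardizes on.
#include <cstdio>
#include <cstdlib>

#define SKETCH_ENFORCE_EQ(a, b)                                   \
  do {                                                            \
    if (!((a) == (b))) {                                          \
      std::fprintf(stderr, "Enforce failed: %s == %s\n", #a, #b); \
      std::abort();                                               \
    }                                                             \
  } while (0)

#define SKETCH_ENFORCE_NOT_NULL(p) SKETCH_ENFORCE_EQ((p) != nullptr, true)

// CUDA, cuDNN, cuBLAS and NCCL all use 0 as their success code, which is
// what this simplified stand-in relies on.
#define SKETCH_ENFORCE_CUDA_SUCCESS(call)                \
  do {                                                   \
    auto sketch_status = (call);                         \
    if (sketch_status != 0) {                            \
      std::fprintf(stderr, "%s returned %d\n", #call,    \
                   static_cast<int>(sketch_status));     \
      std::abort();                                      \
    }                                                    \
  } while (0)

int sketch_cuda_call() { return 0; }  // stands in for a CUDA call returning cudaSuccess

int main() {
  int rank = 3;
  SKETCH_ENFORCE_EQ(rank, 3);    // was: PADDLE_ENFORCE(rank == 3);
  int* ptr = &rank;
  SKETCH_ENFORCE_NOT_NULL(ptr);  // was: PADDLE_ENFORCE(ptr != nullptr, ...);
  SKETCH_ENFORCE_CUDA_SUCCESS(sketch_cuda_call());  // was: PADDLE_ENFORCE(cudaMalloc(...));
  return 0;
}

The hunks below apply this substitution file by file.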
@@ -66,12 +66,12 @@ class Node {
   std::string Name() const { return name_; }

   VarDesc* Var() const {
-    PADDLE_ENFORCE(IsVar());
+    PADDLE_ENFORCE_EQ(IsVar(), true);
     return var_desc_.get();
   }

   OpDesc* Op() const {
-    PADDLE_ENFORCE(IsOp());
+    PADDLE_ENFORCE_EQ(IsOp(), true);
     return op_desc_.get();
   }
......
@@ -53,7 +53,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
     auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
     auto stream =
@@ -64,7 +64,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
     auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
     PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
     auto stream =
@@ -75,7 +75,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
     auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
     auto ctx_place = ctx.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true);
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
     if (platform::is_same_place(src_place, dst_place)) {
......
@@ -146,7 +146,7 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
   dst->resize(src.numel());
   auto dst_ptr = static_cast<void*>(dst->data());
-  PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
+  PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true);
   memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
                src_ptr, size);
......
@@ -25,7 +25,7 @@ namespace detail {
  */
 template <typename T, typename... ARGS>
 inline T& Ref(T* ptr, ARGS&&... args) {
-  PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...));
+  PADDLE_ENFORCE_NOT_NULL(ptr, ::paddle::string::Sprintf(args...));
   return *ptr;
 }
......
@@ -23,14 +23,14 @@ namespace operators {
 inline float get_period_sparcity(const std::vector<float>& sparsity,
                                  float cur_step, float rampup_steps) {
-  PADDLE_ENFORCE(static_cast<int>(cur_step) >= 0);
+  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
   size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
   if (idx >= sparsity.size()) {
     return 0.999;
   }
-  PADDLE_ENFORCE(idx < sparsity.size());
+  PADDLE_ENFORCE_LT(idx, sparsity.size());
   return sparsity[idx];
 }
@@ -63,7 +63,8 @@ class DGCOpKernel : public framework::OpKernel<T> {
     float ratio =
         1 - get_period_sparcity(sparsity, static_cast<float>(*current_step),
                                 rampup_step);
-    PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0);
+    PADDLE_ENFORCE_GE(ratio, 0.0);
+    PADDLE_ENFORCE_LT(ratio, 1.0);
     int k = static_cast<int>(g->numel() * ratio);
     VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
......
@@ -86,9 +86,10 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
     auto* x_data = x->data<T>();
     auto* y_data = y->mutable_data<T>(context.GetPlace());
     if (dropout_prob == 1.0f) {
-      PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
-      PADDLE_ENFORCE(cudaMemsetAsync(mask_data, 0,
-                                     x_numel * sizeof(*mask_data), stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
+          cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync(
+          mask_data, 0, x_numel * sizeof(*mask_data), stream));
       return;
     }
......
@@ -66,8 +66,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                   const framework::Tensor& im, const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col->dims().size() == 5);
+    PADDLE_ENFORCE_EQ(im.dims().size(), 3);
+    PADDLE_ENFORCE_EQ(col->dims().size(), 5);
     int im_channels = im.dims()[0];
     int im_height = im.dims()[1];
@@ -152,8 +152,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im) {
-    PADDLE_ENFORCE(im->dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
+    PADDLE_ENFORCE_EQ(im->dims().size(), 3);
+    PADDLE_ENFORCE_EQ(col.dims().size(), 5);
     int im_channels = im->dims()[0];
     int im_height = im->dims()[1];
@@ -249,8 +249,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                   const framework::Tensor& im, const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col) {
-    PADDLE_ENFORCE(im.dims().size() == 3);
-    PADDLE_ENFORCE(col->dims().size() == 5);
+    PADDLE_ENFORCE_EQ(im.dims().size(), 3);
+    PADDLE_ENFORCE_EQ(col->dims().size(), 5);
     int im_channels = im.dims()[0];
     int im_height = im.dims()[1];
     int im_width = im.dims()[2];
@@ -331,8 +331,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im) {
-    PADDLE_ENFORCE(im->dims().size() == 3);
-    PADDLE_ENFORCE(col.dims().size() == 5);
+    PADDLE_ENFORCE_EQ(im->dims().size(), 3);
+    PADDLE_ENFORCE_EQ(col.dims().size(), 5);
     int im_channels = im->dims()[0];
     int im_height = im->dims()[1];
     int im_width = im->dims()[2];
......
@@ -142,9 +142,9 @@ void GPUSampleWithProb<T>::operator()(
   int num_tries = UniqSampler<T>(sampler, num_samples, s_data);
   VLOG(1) << "num_tries: " << num_tries;
-  PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data,
-                            sizeof(int64_t) * num_samples,
-                            cudaMemcpyHostToDevice));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data,
                                         sizeof(int64_t) * num_samples,
                                         cudaMemcpyHostToDevice));
   int threads = 512;
   const size_t size = batch_size * num_sampled_classes;
......
@@ -55,11 +55,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
     auto* in1_data = in1_value.data<T>();
     auto in1_place = input1.place();
-    PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
     auto in2_place = input2.place();
-    PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);
     auto out_place = context.GetPlace();
-    PADDLE_ENFORCE(platform::is_gpu_place(out_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true);

     memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
                  boost::get<platform::CUDAPlace>(in1_place), in1_data,
@@ -162,9 +162,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     }

     auto in1_place = input1.place();
-    PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true);
     auto in2_place = input2->place();
-    PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true);

     auto* in1_data = in1_value.data<T>();
     auto* in2_data = in2_value->data<T>();
......
@@ -78,8 +78,8 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
                   framework::Tensor* col) const {
-    PADDLE_ENFORCE(vol.dims().size() == 4);
-    PADDLE_ENFORCE(col->dims().size() == 7);
+    PADDLE_ENFORCE_EQ(vol.dims().size(), 4);
+    PADDLE_ENFORCE_EQ(col->dims().size(), 7);
     int input_channels = vol.dims()[0];
     int input_depth = vol.dims()[1];
@@ -204,8 +204,8 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
                   const std::vector<int>& strides,
                   const std::vector<int>& paddings,
                   framework::Tensor* vol) const {
-    PADDLE_ENFORCE(vol->dims().size() == 4);
-    PADDLE_ENFORCE(col.dims().size() == 7);
+    PADDLE_ENFORCE_EQ(vol->dims().size(), 4);
+    PADDLE_ENFORCE_EQ(col.dims().size(), 7);
     int input_channels = vol->dims()[0];
     int input_depth = vol->dims()[1];
......
@@ -30,7 +30,7 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
     auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
     auto* grad_var = ctx.InputVar("Grad");
     // only support dense for now.
-    PADDLE_ENFORCE(grad_var->IsType<framework::LoDTensor>());
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true);
     auto grad = ctx.Input<framework::LoDTensor>("Grad");
     param_out->mutable_data<T>(ctx.GetPlace());
......
@@ -49,11 +49,12 @@ static void CPUTakeAlongD1(const platform::DeviceContext& ctx,
                            const framework::Tensor& array,
                            const framework::Tensor& index,
                            framework::Tensor* value) {
-  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
+  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
   // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
-  PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 &&
-                 index.dims()[0] == array.dims()[0] &&
-                 index.dims() == value->dims());
+  PADDLE_ENFORCE_EQ(index.dims().size(), 2);
+  PADDLE_ENFORCE_EQ(array.dims().size(), 2);
+  PADDLE_ENFORCE_EQ(index.dims()[0], array.dims()[0]);
+  PADDLE_ENFORCE_EQ(index.dims(), value->dims());
   const auto batch_size = index.dims()[0];
   const auto num_take = index.dims()[1];
@@ -88,11 +89,12 @@ static void CPUPutAlongD1(const platform::DeviceContext& ctx,
                           framework::Tensor* array,
                           const framework::Tensor& index,
                           const framework::Tensor& value) {
-  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
+  PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true);
   // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K)
-  PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 &&
-                 index.dims()[0] == array->dims()[0] &&
-                 index.dims() == value.dims());
+  PADDLE_ENFORCE_EQ(index.dims().size(), 2);
+  PADDLE_ENFORCE_EQ(array->dims().size(), 2);
+  PADDLE_ENFORCE_EQ(index.dims()[0], array->dims()[0]);
+  PADDLE_ENFORCE_EQ(index.dims(), value.dims());
   const auto batch_size = index.dims()[0];
   const auto num_put = index.dims()[1];
   auto array_dims = array->dims();
@@ -147,8 +149,8 @@ class SampleLogitsKernel : public framework::OpKernel<T> {
  public:
   using Tensor = framework::Tensor;
   void Compute(const framework::ExecutionContext& context) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
-                   "This kernel only runs on CPU.");
+    PADDLE_ENFORCE_EQ(platform::is_cpu_place(context.GetPlace()), true,
+                      "This kernel only runs on CPU.");
     VLOG(3) << "Enter SampleLogitsKernel";
     // get necessary inputs
     const Tensor* logits = context.Input<Tensor>("Logits");
......
@@ -92,8 +92,8 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
   bool in_place = out_var == in_vars[0];
   auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
   for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
-    PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
-                   "Only support all inputs are TensorArray");
+    PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensorArray>(), true,
+                      "Only support all inputs are TensorArray");
     auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
     for (size_t i = 0; i < in_array.size(); ++i) {
@@ -106,7 +106,7 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) {
                       context.device_context(), &out_array[i]);
         out_array[i].set_lod(in_array[i].lod());
       } else {
-        PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
+        PADDLE_ENFORCE_EQ(out_array[i].lod(), in_array[i].lod());
         auto in = EigenVector<T>::Flatten(in_array[i]);
         auto result = EigenVector<T>::Flatten(out_array[i]);
         result.device(*context.template device_context<DeviceContext>()
......
@@ -178,7 +178,7 @@ class SyncBatchNormKernel : public framework::OpKernel<T> {
     int dtype = platform::ToNCCLDataType(x->type());
     // In-place operation
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
         comm, stream));
@@ -398,7 +398,7 @@ class SyncBatchNormGradKernel : public framework::OpKernel<T> {
     }
     int dtype = platform::ToNCCLDataType(x->type());
     // In-place operation
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         stats, stats, 2 * C + 1, static_cast<ncclDataType_t>(dtype), ncclSum,
         comm, stream));
......
@@ -29,17 +29,19 @@ namespace platform {
 class CublasHandleHolder {
  public:
   CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) {
-    PADDLE_ENFORCE(dynload::cublasCreate(&handle_));
-    PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream));
 #if CUDA_VERSION >= 9000
     if (math_type == CUBLAS_TENSOR_OP_MATH) {
-      PADDLE_ENFORCE(
-          dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
+      PADDLE_ENFORCE_CUDA_SUCCESS(
          dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH));
     }
 #endif
   }

-  ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); }
+  ~CublasHandleHolder() {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasDestroy(handle_));
+  }

   template <typename Callback>
   inline void Call(Callback &&callback) const {
......
@@ -221,10 +221,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat(
 class ScopedTensorDescriptor {
  public:
   ScopedTensorDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_));
   }
   ~ScopedTensorDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_));
   }

   inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format,
@@ -243,7 +243,7 @@ class ScopedTensorDescriptor {
     if (groups > 1) {
       dims_with_group[1] = dims_with_group[1] / groups;
     }
-    PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor(
         desc_, type, dims_with_group.size(), dims_with_group.data(),
         strides.data()));
     return desc_;
@@ -265,10 +265,10 @@ class ScopedTensorDescriptor {
 class ScopedFilterDescriptor {
  public:
   ScopedFilterDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_));
   }
   ~ScopedFilterDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_));
   }

   inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format,
@@ -284,7 +284,7 @@ class ScopedFilterDescriptor {
       kernel_with_group[0] /= groups;
       // NOTE: input filter(C) of the filter is already asserted to be C/groups.
     }
-    PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor(
         desc_, type, format, kernel_with_group.size(),
         kernel_with_group.data()));
     return desc_;
@@ -306,10 +306,12 @@ class ScopedFilterDescriptor {
 class ScopedConvolutionDescriptor {
  public:
   ScopedConvolutionDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreateConvolutionDescriptor(&desc_));
   }
   ~ScopedConvolutionDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnDestroyConvolutionDescriptor(desc_));
   }

   inline cudnnConvolutionDescriptor_t descriptor(
@@ -332,7 +334,7 @@ class ScopedConvolutionDescriptor {
     cudnnDataType_t compute_type =
         (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT;
-    PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor(
         desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
         CUDNN_CROSS_CORRELATION, compute_type));
     return desc_;
@@ -353,10 +355,10 @@ class ScopedConvolutionDescriptor {
 class ScopedPoolingDescriptor {
  public:
   ScopedPoolingDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_));
   }
   ~ScopedPoolingDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_));
   }

   inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode,
@@ -365,7 +367,7 @@ class ScopedPoolingDescriptor {
                                              const std::vector<int>& strides) {
     PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
     PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
-    PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor(
         desc_, (GetPoolingMode(mode)),
         CUDNN_PROPAGATE_NAN,  // Always propagate nans.
        kernel.size(), kernel.data(), pads.data(), strides.data()));
@@ -380,16 +382,18 @@ class ScopedPoolingDescriptor {
 class ScopedSpatialTransformerDescriptor {
  public:
   ScopedSpatialTransformerDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreateSpatialTransformerDescriptor(&desc_));
   }
   ~ScopedSpatialTransformerDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnDestroySpatialTransformerDescriptor(desc_));
   }

   template <typename T>
   inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims,
                                                         const int dimA[]) {
-    PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor(
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor(
         desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType<T>::type, nbDims, dimA));
     return desc_;
   }
@@ -402,10 +406,12 @@ class ScopedSpatialTransformerDescriptor {
 class ScopedActivationDescriptor {
  public:
   ScopedActivationDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreateActivationDescriptor(&desc_));
   }
   ~ScopedActivationDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnDestroyActivationDescriptor(desc_));
   }

   template <typename T>
@@ -467,15 +473,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
 class ScopedCTCLossDescriptor {
  public:
   ScopedCTCLossDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnCreateCTCLossDescriptor(&desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_));
   }
   ~ScopedCTCLossDescriptor() {
-    PADDLE_ENFORCE(dynload::cudnnDestroyCTCLossDescriptor(desc_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_));
   }

   template <typename T>
   inline cudnnCTCLossDescriptor_t descriptor() {
-    PADDLE_ENFORCE(
-        dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType<T>::type));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType<T>::type));
     return desc_;
   }
......
@@ -167,7 +167,7 @@ class CudnnHolder {
   inline void ResetWorkspace() {
     if (workspace_) {
       // Maybe someone is using the current workspace
-      PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(*stream_));
       workspace_ = nullptr;
     }
   }
@@ -306,7 +306,7 @@ class CUDADeviceContext : public DeviceContext {
   template <typename Callback>
   void RecordEvent(cudaEvent_t ev, Callback callback) {
     callback();
-    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_));
   }

   template <typename Callback>
......
@@ -63,11 +63,11 @@ class NCCLGroupGuard {
   inline NCCLGroupGuard() {
     NCCLMutex().lock();
-    PADDLE_ENFORCE(dynload::ncclGroupStart());
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart());
   }

   inline ~NCCLGroupGuard() {
-    PADDLE_ENFORCE(dynload::ncclGroupEnd());
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd());
     NCCLMutex().unlock();
   }
 };
@@ -94,7 +94,7 @@ struct NCCLContextMap {
   explicit NCCLContextMap(const std::vector<platform::Place> &places,
                           ncclUniqueId *nccl_id = nullptr,
                           size_t num_trainers = 1, size_t trainer_id = 0) {
-    PADDLE_ENFORCE(!places.empty());
+    PADDLE_ENFORCE_EQ(!places.empty(), true);
     order_.reserve(places.size());
     for (auto &p : places) {
       int dev_id = boost::get<CUDAPlace>(p).device;
@@ -109,7 +109,7 @@ struct NCCLContextMap {
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {
       std::lock_guard<std::mutex> guard(NCCLGroupGuard::NCCLMutex());
-      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll(
          comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
       PADDLE_ENFORCE_NOT_NULL(nccl_id);
@@ -126,8 +126,8 @@ struct NCCLContextMap {
       }
       VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks
               << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
-      PADDLE_ENFORCE(cudaSetDevice(gpu_id));
-      PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+      PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id));
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank(
          comms.get() + i, nranks, *nccl_id, rank));
     }
   }
@@ -249,13 +249,13 @@ class NCCLCommunicator {
                  size_t trainers_num, size_t trainer_id,
                  size_t inter_trainers_num,
                  size_t exter_trainers_num) {
-    PADDLE_ENFORCE(trainers_num == inter_trainers_num * exter_trainers_num,
-                   "trainers_num:%llu != inter_trainers_num:%llu * "
-                   "exter_trainers_num:%llu",
-                   trainers_num, inter_trainers_num, exter_trainers_num);
+    PADDLE_ENFORCE_EQ(trainers_num, inter_trainers_num * exter_trainers_num,
+                      "trainers_num:%llu != inter_trainers_num:%llu * "
+                      "exter_trainers_num:%llu",
+                      trainers_num, inter_trainers_num, exter_trainers_num);

-    PADDLE_ENFORCE(inter_trainers_num > 1, "inter_trainers_num:%llu must > 1",
-                   inter_trainers_num);
+    PADDLE_ENFORCE_GT(inter_trainers_num, 1, "inter_trainers_num:%llu must > 1",
+                      inter_trainers_num);

     int inter_trainer_id = trainer_id % inter_trainers_num;
     for (size_t i = 0; i < inter_nccl_ids.size(); i++) {
......
@@ -35,13 +35,13 @@ void DummyKernelAndEvent() {
   ForEachDevice([](int d) {
     platform::SetDeviceId(d);
     cudaStream_t stream;
-    PADDLE_ENFORCE(cudaStreamCreate(&stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
     Mark("_cuda_startup_");
     int *ptr;
-    PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int)));
     DummyKernel<<<1, 1, 0, stream>>>(ptr);
-    PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-    PADDLE_ENFORCE(cudaFree(ptr));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr));
   });
 }
 }
......