diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 8a927d8e282a03e8a74c0814ee8d9b247451a091..07fe7d245ef57814d704d11be6f6fe45cf514b2d 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -4,7 +4,7 @@ endif() INCLUDE(ExternalProject) SET(XPU_PROJECT "extern_xpu") -SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE) +SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE) SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 21080fbe8fd2e14cf7fd805e01948f2f28535c22..7aa2766763ce9441b0e4de969930af50fb7a55e0 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -62,9 +62,9 @@ function(op_library TARGET) endif() endif() if(WITH_XPU) - string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}") - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc) - list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc) + string(REPLACE "_op" "_op_xpu" XPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${XPU_FILE}.cc) + list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() else() @@ -83,7 +83,7 @@ function(op_library TARGET) list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) - elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$") + elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ffd32cc78f087504dde0e9f5b2b1f39a8bff557e..1eb2096af91dc99ac22b000d2de269bde2efcbbf 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -270,6 +270,10 @@ if(WITH_PSLIB) endif() endif(WITH_PSLIB) +if(NOT WIN32 AND NOT APPLE) + include(external/gloo) + list(APPEND third_party_deps extern_gloo) +endif() if(WITH_BOX_PS) include(external/box_ps) @@ -277,10 +281,6 @@ if(WITH_BOX_PS) endif(WITH_BOX_PS) if(WITH_DISTRIBUTE) - if(WITH_GLOO) - include(external/gloo) - list(APPEND third_party_deps extern_gloo) - endif() if(WITH_GRPC) list(APPEND third_party_deps extern_grpc) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 939a2fc8fc9c73472ff5c25633610fa70c7cec6d..78887f3ac5195893ca304ea97d5bf4218c5952f8 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl( platform::errors::InvalidArgument( "The NoDummyInputSize should be equal " "to the number of places, but got NoDummyInputSize is " - "%d and the number of place is %d.", + "%d and the number of places is %d.", in_var_handles.size(), num_places)); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), @@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl( platform::errors::InvalidArgument( "The number of local scopes should be equal " "to the number of places, but got the number of local scopes is " - "%d and the number of place is %d.", + "%d and the number of places is %d.", in_var_handles.size(), num_places)); std::vector lod_tensor_data; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 4c3b0a7c6a44ca0d304113e57ebb3be9e1a7de27..35b106606740556481cd98ce76955e953f7e0ee7 
100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/broadcast_op_handle.h" + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" @@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() { auto out_var_handles = DynamicCast(outputs_); PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, - "The number of input should be one."); - PADDLE_ENFORCE_EQ( - out_var_handles.size(), places_.size(), - "The number of output should equal to the number of places."); + platform::errors::PreconditionNotMet( + "The number of inputs should be 1, but got %d.", + in_var_handles.size())); + PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(), + platform::errors::PreconditionNotMet( + "The number of outputs and the number of places should " + "be equal, but got the number of outputs is %d and the " + "number of places is %d.", + out_var_handles.size(), places_.size())); VarHandle *in_var_handle = in_var_handles[0]; @@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar( const std::vector &var_scopes) { auto *in_var = var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound("Variable %s is not found in scopes.", + in_var_handle.name())); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); if (UNLIKELY(!in_tensor.IsInitialized())) { VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!"; @@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( send_recv_buffer, numel, static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); @@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar( nccl_ctxs_->DevCtx(p)->Wait(); } #else - PADDLE_THROW("CUDA is not enabled."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCCL.")); #endif } } @@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue( auto t_out_p = out_var_handle->place(); auto *out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound( + "Variable %s is not found in scopes.", + out_var_handle->name())); if (is_gpu_place(in_tensor.place())) { - PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), - "Places of input and output must be all on GPU."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true, + platform::errors::PreconditionNotMet( + "Places of input and output must be all on GPU.")); } else { t_out_p = platform::CPUPlace(); } diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index e455879a68f70b3f4f33fb5e6ede0fd9e9f22d5f..4fdc420e1e0752ac3122d25db5ed1423bb47c69e 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -79,7 +79,8 @@ struct TestBroadcastOpHandle { } nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + 
platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { int count = 8; @@ -113,7 +114,8 @@ struct TestBroadcastOpHandle { op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { #if defined(PADDLE_WITH_NCCL) @@ -171,7 +173,9 @@ struct TestBroadcastOpHandle { float val_scalar = 0.0) { auto var = param_scopes_[input_scope_idx]->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto lod_tensor = var->GetMutable(); std::vector send_vector(static_cast(f::product(kDims))); for (size_t k = 0; k < send_vector.size(); ++k) { @@ -194,7 +198,9 @@ struct TestBroadcastOpHandle { } auto var = param_scopes_[input_scope_idx]->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto selected_rows = var->GetMutable(); auto value = selected_rows->mutable_value(); value->mutable_data(kDims, place_list_[input_scope_idx]); @@ -211,13 +217,24 @@ struct TestBroadcastOpHandle { const std::vector& send_vector, const std::vector& rows, int height) { auto var = param_scopes_[input_scope_idx]->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto& selected_rows = var->Get(); auto rt = selected_rows.value(); - PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); + PADDLE_ENFORCE_EQ(selected_rows.height(), height, + platform::errors::InvalidArgument( + "The height of SelectedRows is not equal to " + "the expected, expect %d, but got %ld.", + height, selected_rows.height())); for (size_t k = 0; k < selected_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); + PADDLE_ENFORCE_EQ( + selected_rows.rows()[k], rows[k], + platform::errors::InvalidArgument( + "The item at position %zu of rows of SelectedRows " + "is not equal to the expected, expect %ld, but got %ld.", + k, rows[k], selected_rows.rows()[k])); } p::CPUPlace cpu_place; @@ -235,9 +252,15 @@ struct TestBroadcastOpHandle { framework::Scope* scope) { p::CPUPlace cpu_place; auto var = scope->FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + varname)); auto tensor = var->Get(); - PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); + PADDLE_ENFORCE_EQ(tensor.lod(), lod, + platform::errors::InvalidArgument( + "The LoD of tensor is not equal to " + "the expected, expect %s, but got %s.", + lod, tensor.lod())); f::Tensor result_tensor; f::TensorCopySync(tensor, cpu_place, &result_tensor); float* ct = result_tensor.mutable_data(cpu_place); diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index ecdb8cc9b8cdf38385f9bc6005bf6a889dfa7741..962f968c84ea46f0476feec70a5d3f53ba8b65ea 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("reduce_mode_multi_devices_pass").get(); break; default: - PADDLE_THROW("Unknown reduce strategy."); + PADDLE_THROW( + 
platform::errors::Unimplemented("Unknown reduce strategy.")); } } multi_devices_pass->SetNotOwned("strategy", diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 7735f9720c109407249ea19f0e5e609a02cfd22e..266557cb8554ae310ae27046075cfcb35b05d9da 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" + #include #include #include -#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" @@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( if (dynamic_cast(gc_)) { platform::CUDADeviceGuard guard( BOOST_GET_CONST(platform::CUDAPlace, place).device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - PADDLE_ENFORCE_NOT_NULL(event_); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument( + "The cuda envet created is NULL.")); } } #endif - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "Variable names are empty.")); + PADDLE_ENFORCE_NE(vars.empty(), true, + platform::errors::InvalidArgument( + "The variables to be deleted are empty.")); for (auto *var : var_infos_) { - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( + "The memory optimization info is NULL.")); } } @@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { if (event_) { auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); platform::CUDADeviceGuard guard(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); } #endif } @@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() { } void EagerDeletionOpHandle::CallOnce() { - PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here"); + PADDLE_ENFORCE_EQ( + vars_.empty(), true, + platform::errors::InvalidArgument( + "The variables to be deleted should be initialized here.")); Scope *exec_scope = local_exec_scopes_[0]; for (auto *var_info : var_infos_) { auto *var = exec_scope->FindVar(var_info->Name()); - PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr", - var_info->Name()); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The variable(%s) to be inplaced is not found in scope.", + var_info->Name())); vars_.emplace_back(var); } } @@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() { garbages.emplace_back(t.MoveMemoryHolder()); } } else { - PADDLE_THROW("Type %s of %s is not supported eager deletion", - framework::ToTypeName(var->Type()), var_info->Name()); + PADDLE_THROW(platform::errors::Unimplemented( + "The variable(%s) of type %s is not supported in eager deletion.", + framework::ToTypeName(var->Type()), var_info->Name())); } } @@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages( auto callback_stream = reinterpret_cast(gc_)->stream(); auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - 
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(callback_stream, event_, 0)); }; gc_->Add(std::move(*garbages), callback_func); } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index c67e21d5c4728110a800d4fd0367ee33441ce3c7..c538811669924aae2e33a6c18d7b1eb1ca9268cb 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" + #include #include + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() { size_t place_num = places_.size(); PADDLE_ENFORCE_EQ( in_var_handles.size(), place_num * num_of_all_reduce_, - "The NoDummyInputSize should be equal to the number of places."); + platform::errors::PreconditionNotMet( + "The number of input variable handles should be equal to the number " + "of places multiplied by the number of all reduce handles, " + "but got the number of input variable handles is %d, the " + "number of places is %d, and the number of all reduce handles " + "is %d.", + in_var_handles.size(), place_num, num_of_all_reduce_)); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); + platform::errors::PreconditionNotMet( + "The number of input variable handles should be equal to the number " + "of output variable handles, but got the number of input variable " + "handles is %d, and the number of output variable handles is %d.", + in_var_handles.size(), out_var_handles.size())); // Note: some gradient op doesn't have CUDAKernel, so the gradients of // those op are in CPUPlace, in this case, the all reduce should not be fused. @@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( dtype = ele_dtype; } - PADDLE_ENFORCE_EQ(ele_dtype, dtype); + PADDLE_ENFORCE_EQ( + ele_dtype, dtype, + platform::errors::InvalidArgument( + "The DataType of grad tensors of fused_all_reduce_op_handle " + "must be consistent. The current dtype is %s, but the " + "previous dtype is %s.", + DataTypeToString(ele_dtype), DataTypeToString(dtype))); // Check whether the address space is contiguous. std::sort( @@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( "input[%d] address: 0X%02x. The offset: %d", k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, next_address, k, infer_next_address, offset); - PADDLE_ENFORCE_EQ(infer_next_address, next_address, - "The address is not consistent."); + PADDLE_ENFORCE_EQ( + infer_next_address, next_address, + platform::errors::InvalidArgument( + "The inferred address of the next tensor should be equal to the " + "real address of the next tensor. 
But got inferred address is %p " + "and real address is %p.", + infer_next_address, next_address)); } } if (!FLAGS_skip_fused_all_reduce_check) { for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { for (size_t j = 1; j < num_of_all_reduce_; ++j) { - PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, - grads_tensor.at(scope_idx).at(j).first); + PADDLE_ENFORCE_EQ( + grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first, + platform::errors::InvalidArgument( + "The variable name of grad tensors of " + "fused_all_reduce_op_handle " + "must be consistent. The current name is %s, but the " + "previous name is %s.", + grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first)); } } } @@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace( for (size_t j = 0; j < in_var_handles.size(); j += place_num) { auto var_name = in_var_handles[j]->name(); auto var = local_scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The variable '%s' is not found in local scope.", var_name)); auto &lod_tensor = var->Get(); if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { return true; @@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( size_t place_num = places_.size(); for (size_t j = 0; j < in_var_handles.size(); j += place_num) { auto var_name = in_var_handles[j]->name(); - PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); + PADDLE_ENFORCE_EQ( + var_name, out_var_handles[j]->name(), + platform::errors::InvalidArgument( + "The name of input variable should be equal " + "to the name of output variable. But got the name of input " + "variable is %s and the name of output variable is %s.", + var_name, out_var_handles[j]->name())); auto var = local_scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The variable '%s' is not found in local scope.", var_name)); auto &lod_tensor = var->Get(); PADDLE_ENFORCE_EQ( platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)), - true, "%s(%d) is not in the right place.", var_name, scope_idx); + true, platform::errors::InvalidArgument( + "The variable '%s' at scope %d is not in the right place.", + var_name, scope_idx)); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); } } @@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( size_t size_of_dtype = 0; for (size_t i = 0; i < grad_tensor.size(); ++i) { // Get dtype - auto ele_type = grad_tensor.at(i).second->type(); + auto ele_dtype = grad_tensor.at(i).second->type(); if (i == 0) { - *dtype = ele_type; - size_of_dtype = framework::SizeOfType(ele_type); + *dtype = ele_dtype; + size_of_dtype = framework::SizeOfType(ele_dtype); } - PADDLE_ENFORCE_EQ(ele_type, *dtype); + PADDLE_ENFORCE_EQ( + ele_dtype, *dtype, + platform::errors::InvalidArgument( + "The DataType of grad tensors of fused_all_reduce_op_handle " + "must be consistent. 
The current dtype is %s, but the " + "previous dtype is %s.", + DataTypeToString(ele_dtype), DataTypeToString(*dtype))); // Get element number int64_t len = grad_tensor.at(i).second->numel(); - PADDLE_ENFORCE_GT(len, 0); + PADDLE_ENFORCE_GT( + len, 0, platform::errors::InvalidArgument( + "The size of grad tensors of fused_all_reduce_op_handle " + "must be > 0, but got %d.", + len)); *numel += platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } } diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc index 59c5da0de8c114823a1cad3e6d65c92081b5a2b6..1ae09dcde9fc8e4a413f8876dd33d0ac53f181c3 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" @@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() { WaitInputVarGenerated(); size_t place_num = places_.size(); - PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size()); + PADDLE_ENFORCE_EQ( + in_var_handles.size() * place_num, out_var_handles.size(), + platform::errors::PreconditionNotMet( + "The number of input variable handles multiplied by the number " + "of places should be equal to the number of output variable handles, " + "but got the number of input variable handles is %d, the " + "number of places is %d, and the number of output variable handles " + "is %d.", + in_var_handles.size(), place_num, out_var_handles.size())); for (size_t i = 0; i < in_var_handles.size(); ++i) { BroadcastOneVar( diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 761a5b5a30a0e04690a7dc94752179130c85320a..ce7621d4e35a3f139047b543c4e77d805841e459 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" + #include #include + #include "gtest/gtest.h" #include "paddle/fluid/framework/details/broadcast_op_handle_test.h" #include "paddle/fluid/framework/details/op_handle_base.h" @@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else - PADDLE_THROW("CUDA is not supported."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index a039c6200e394eebf6c44846ce2b0bf5d773e764..2d3b2fb39afbe7be680704e63e52d1951f3d8946 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/gather_op_handle.h" + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() { PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), - "The number of output should equal to the number of places."); + platform::errors::InvalidArgument( + "The number of input variables should be equal " + "to the number of places, but got the number of input variables is " + "%d and the number of places is %d.", + in_var_handles.size(), places_.size())); VarHandle *out_var_handle; { auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, - "The number of output should be one."); + PADDLE_ENFORCE_EQ( + out_var_handles.size(), 1, + platform::errors::InvalidArgument( + "The number of output variables should be 1, but got %d.", + out_var_handles.size())); out_var_handle = out_var_handles.front(); } @@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() { auto in_0_handle = in_var_handles[0]; auto pre_in_var = var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); - PADDLE_ENFORCE_NOT_NULL(pre_in_var); + PADDLE_ENFORCE_NOT_NULL( + pre_in_var, + platform::errors::NotFound("The variable '%s' is not found in the scope.", + in_0_handle->name())); - PADDLE_ENFORCE(pre_in_var->IsType(), - "Currently, gather_op only can gather SelectedRows."); + PADDLE_ENFORCE_EQ(pre_in_var->IsType(), true, + platform::errors::Unimplemented( + "Currently, gather_op only supports SelectedRows.")); // Wait input done, this Wait is asynchronous operation WaitInputVarGenerated(); @@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() { for (auto *in_handle : in_var_handles) { auto *in_var = var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, + platform::errors::NotFound( + "The variable '%s' is not found in the scope.", in_handle->name())); VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); auto &in_sr_value = in_var->Get(); @@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() { // NOTE: The Places of all input tensor must be all on CPU or all on GPU. platform::Place t_out_p = out_var_handle->place(); if (platform::is_gpu_place(pre_in_value.place())) { - PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), - "Places of input and output must be all on GPU."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true, + platform::errors::PreconditionNotMet( + "Places of input and output must be all on GPU.")); } else { t_out_p = platform::CPUPlace(); } auto out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL( + out_var, + platform::errors::NotFound("The variable '%s' is not found in the scope.", + out_var_handle->name())); auto out_value = out_var->GetMutable(); out_value->set_height(pre_in_value.height()); out_value->set_rows(out_rows); diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index f3fcc1a436df38986e1202755cd88f14069028a8..60c1d0d39a551fb1cec523109e76c309d11ea248 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/gather_op_handle.h" + #include #include + #include "gtest/gtest.h" namespace paddle { @@ -60,7 +62,8 @@ struct TestGatherOpHandle { ctxs_.emplace_back(new p::CUDADeviceContext(p)); } #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { int count = 8; @@ -141,7 +144,9 @@ struct TestGatherOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "The variable '%s' is not found in the scope.", "input")); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -155,7 +160,9 @@ struct TestGatherOpHandle { } auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "The variable '%s' is not found in the scope.", "out")); auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); @@ -173,9 +180,19 @@ struct TestGatherOpHandle { auto& out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, + platform::errors::InvalidArgument( + "The height of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + height, out_select_rows.height())); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + PADDLE_ENFORCE_EQ( + out_select_rows.rows()[k], rows[k % rows.size()], + platform::errors::InvalidArgument( + "The item at position %d of rows of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + k, rows[k % rows.size()], out_select_rows.rows()[k])); } f::Tensor result_tensor; @@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) { test_op.TestGatherSelectedRows(input_scope_idx); } #endif + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index 2d4d4122a3c0f60e5e207556d20886985f72a30a..22a059773f513f1a4a3aef0d3ca9b603fcd30bf8 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase { } virtual ~NCCLOpHandleBase() { for (auto& ev : inter_events_) { - PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); } for (auto& ev : exter_events_) { - PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); } } void SetRunEnv(int run_order, bool use_hierarchical_allreduce) { - PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0"); + PADDLE_ENFORCE_GE( + run_order, 0, + platform::errors::InvalidArgument( + "The argument run_order must be >= 0, but got %d.", run_order)); run_order_ = run_order; use_hierarchical_allreduce_ = use_hierarchical_allreduce; @@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase { return; } - PADDLE_ENFORCE(places_.size() == 1, - "HierarchicalAllReduce run one proc 
with one card mode."); + PADDLE_ENFORCE_EQ(places_.size(), 1, + platform::errors::InvalidArgument( + "HierarchicalAllReduce can only run " + "one proccess with one card mode, but got %d cards.", + places_.size())); for (auto& p : places_) { auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order); @@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase { continue; } - PADDLE_ENFORCE(cudaSetDevice(dev_id)); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id], - cudaEventDisableTiming)); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id], - cudaEventDisableTiming)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + &inter_events_[dev_id], cudaEventDisableTiming)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags( + &exter_events_[dev_id], cudaEventDisableTiming)); VLOG(10) << "Create events on dev_id:" << dev_id << ", inter_event:" << &inter_events_[dev_id] << ", exter_event:" << &exter_events_[dev_id]; @@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase { void FlatNCCLAllReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { - PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); @@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } void NCCLAllReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { - PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); if (!use_hierarchical_allreduce_) { FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op); return; @@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase { void HierarchicalAllReduce(platform::Place place, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { - PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); InterReduce(place, sendbuff, recvbuff, count, datatype, op); // When a trainer is not in exter allreduce ring // they need not to call this. 
@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE(platform::dynload::ncclReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); cudaEventRecord(inter_events_.at(dev_id), stream); if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE(cudaStreamSynchronize(stream), - "sync HierarchicalAllReduce inter stream error"); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } } @@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase { void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); - PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_); + PADDLE_ENFORCE_NOT_NULL( + nccl_ctxs_, platform::errors::NotFound( + "Can't get exter %d nccl contexts.", run_order_)); int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto& nccl_ctx = nccl_ctxs->at(dev_id); auto stream = nccl_ctx.stream(); @@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase { cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); cudaEventRecord(exter_events_.at(dev_id), stream); if (FLAGS_sync_nccl_allreduce) { - PADDLE_ENFORCE(cudaStreamSynchronize(stream), - "sync HierarchicalAllReduce exter stream error"); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); } } @@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase { << ", stream:" << stream; cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); - PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0, - comm, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( + sendbuff, count, datatype, 0, comm, stream)); } protected: diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 459bcff5c0b740be0d495a6ad648da7424bd1a42..105c37192f57c365abc1429afa7e6627b95eef90 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() { #ifdef PADDLE_WITH_CUDA for (auto &p : dev_ctxes_) { int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; - PADDLE_ENFORCE(cudaSetDevice(dev_id)); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { @@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() { } } } else { - PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, - "%s should have only one dev_ctx.", Name()); + PADDLE_ENFORCE_EQ( + dev_ctxes_.size(), 1UL, + platform::errors::InvalidArgument( + "Operator %s should have only one dev_ctx, but got %d.", Name(), + dev_ctxes_.size())); auto &place = dev_ctxes_.begin()->first; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; for (auto &out_var : outputs_) { auto *out_var_handle = dynamic_cast(out_var); if (out_var_handle) { - PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()), - "The place of output(%s) is not consistent with the " - "place of current op(%s).", - out_var_handle->Name(), Name()); + PADDLE_ENFORCE_EQ( + 
platform::is_same_place(place, out_var_handle->place()), true, + platform::errors::InvalidArgument( + "The place of output(%s) is not consistent with the " + "place of current op(%s).", + out_var_handle->Name(), Name())); out_var_handle->SetGenerateEvent(events_.at(dev_id)); } } @@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) { InitCUDA(); } #else - PADDLE_ENFORCE(!use_cuda); + PADDLE_ENFORCE_EQ(use_cuda, false, + platform::errors::InvalidArgument( + "Argument use_cuda should be false when Paddle is not " + "compiled with CUDA.")); #endif // skip running current op, used with inplace_addto_op_pass @@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_NOT_NULL(waited_ctx); + PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument( + "Argument waited_ctx is NULL.")); if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { for (auto &dev_ctx : dev_ctxes_) { - PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); + PADDLE_ENFORCE_NOT_NULL( + dev_ctx.second, + platform::errors::InvalidArgument("The device context is NULL.")); dev_ctx.second->Wait(); } } else { auto stream = static_cast(waited_ctx)->stream(); for (auto &ev : events_) { - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); } } #else @@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() { auto stream = static_cast(dev_ctxes_.at(place)) ->stream(); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_THROW("Doesn't compile the GPU."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } // There are nothing to do when the place is CPUPlace. @@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else - PADDLE_THROW("Doesn't compile the GPU."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } // There are nothing to do when the place is CPUPlace. @@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes( auto scopes = GetLocalScopes(); for (auto *scope : scopes) { auto iter = scope_map.find(scope); - PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found"); + PADDLE_ENFORCE_NE( + iter, scope_map.end(), + platform::errors::NotFound("Local scope not found in scope map.")); local_exec_scopes_.emplace_back(iter->second); } } diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 1e608000e0ac93bb9be3308a89362c321d6a11f9..453a25166b56e9755528fa17f203557680ebabe6 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" @@ -186,19 +187,20 @@ struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { PADDLE_ENFORCE_EQ(info->proto_, nullptr, platform::errors::AlreadyExists( - "OpProto of %s has been registered", op_type)); + "OpProto of %s has been registered.", op_type)); PADDLE_ENFORCE_EQ(info->checker_, nullptr, platform::errors::AlreadyExists( - "OpAttrChecker of %s has been registered", op_type)); + "OpAttrChecker of %s has been registered.", op_type)); info->proto_ = new proto::OpProto; info->checker_ = new OpAttrChecker(); T maker; maker(info->proto_, info->checker_); info->proto_->set_type(op_type); - PADDLE_ENFORCE( - info->proto_->IsInitialized(), - "Fail to initialize %s's OpProto, because %s is not initialized", - op_type, info->proto_->InitializationErrorString()); + PADDLE_ENFORCE_EQ( + info->proto_->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "Failed to initialize %s's OpProto, because %s is not initialized.", + op_type, info->proto_->InitializationErrorString())); } }; diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 11c4621fde394057144462bb513aab63187512e3..9ecb2d8dbdd1c26e827961cbc9b569d92992c8e3 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -16,6 +16,7 @@ #include #include #include + #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" @@ -32,9 +33,13 @@ struct ReduceLoDTensor { template void apply() const { - PADDLE_ENFORCE(!src_tensors_.empty()); + PADDLE_ENFORCE_NE(src_tensors_.empty(), true, + platform::errors::InvalidArgument( + "The number of tensors to be reduced is 0.")); auto &t0 = *src_tensors_[0]; - PADDLE_ENFORCE_NE(t0.numel(), 0); + PADDLE_ENFORCE_NE(t0.numel(), 0, + platform::errors::InvalidArgument( + "The size of first tensor to be reduced is 0.")); dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); @@ -45,8 +50,19 @@ struct ReduceLoDTensor { continue; } - PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); - PADDLE_ENFORCE_EQ(t.type(), t0.type()); + PADDLE_ENFORCE_EQ(t.dims(), t0.dims(), + platform::errors::InvalidArgument( + "The shape of tensors to be reduced must be " + "consistent. The shape of current tensor is %s, " + "but the shape of the first tensor is %s.", + t.dims(), t0.dims())); + + PADDLE_ENFORCE_EQ(t.type(), t0.type(), + platform::errors::InvalidArgument( + "The type of tensors to be reduced must be " + "consistent. 
The type of current tensor is %s, " + "but the type of the first tensor is %s.", + t.type(), t0.type())); std::transform(t.data(), t.data() + t.numel(), dst, dst, [](T a, T b) -> T { return a + b; }); } @@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor { in_places_(in_places), out_place_(out_place), dst_selected_rows_(dst_selected_rows) { - PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false); + PADDLE_ENFORCE_NE(src_selected_rows.empty(), true, + platform::errors::InvalidArgument( + "The number of selected_rows to be gathered is 0.")); std::vector out_rows; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index d8f8cc994c080953c92adc02943bc6828aa645a6..d7f13f79f68ebe629cc67d5e6c868d331d3c917c 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/details/reduce_op_handle.h" + #include + #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" @@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows( merged_dev_ctx->Wait(); scope->EraseVars(std::vector{gathered_var_name}); - PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); - PADDLE_ENFORCE(remote.size() == vars.size()); + PADDLE_ENFORCE_EQ( + client->Gather(vars, &remote, *merged_dev_ctx, scope), true, + platform::errors::PreconditionNotMet("Gather SelectedRows failed.")); + PADDLE_ENFORCE_EQ(remote.size(), vars.size(), + platform::errors::PreconditionNotMet( + "The number of remotes should be equal to the number " + "of variables to be gathered, but got the number of " + "remotes is %d and the number of variables is %d.", + remote.size(), vars.size())); // 4. merged local selected rows. std::vector all; @@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() { PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), - "The number of output should equal to the number of places."); + platform::errors::InvalidArgument( + "The number of inputs should be equal to the number of places, but " + "got the number of inputs is %d and the number of places is %d.", + in_var_handles.size(), places_.size())); VarHandle *out_var_handle; { auto out_var_handles = DynamicCast(outputs_); PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, - "The number of output should be one."); + platform::errors::InvalidArgument( + "The number of outputs should be one, but got %d.", + out_var_handles.size())); out_var_handle = out_var_handles.front(); } @@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() { auto pre_in_var = var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); - PADDLE_ENFORCE_NOT_NULL(pre_in_var); + + PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound( + "Variable %s is not found in scope.", + in_0_handle->name())); // NOTE: The Places of all input tensor must be all on CPU or all on GPU. 
std::vector in_places; // used to get dev_ctx @@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() { in_places.emplace_back(in_handle->place()); auto in_var = var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); - PADDLE_ENFORCE_NOT_NULL(in_var); + + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound("Variable %s is not found in scope.", + in_handle->name())); + VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); } auto out_var = var_scopes.at(out_var_handle->scope_idx()) ->FindVar(out_var_handle->name()); - PADDLE_ENFORCE_NOT_NULL(out_var); + + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound("Variable %s is not found in scope.", + out_var_handle->name())); // NOTE: The tensors' Place of input and output must be all on GPU or all on // CPU. auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); platform::Place t_out_p; if (platform::is_gpu_place(in_p)) { - PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), - "Places of input and output must be all on GPU."); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true, + platform::errors::PreconditionNotMet( + "Places of input and output must be all on GPU.")); t_out_p = out_var_handle->place(); } else { t_out_p = platform::CPUPlace(); @@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() { in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); } else { - PADDLE_THROW("only support double or float when gather SelectedRows"); + PADDLE_THROW(platform::errors::Unimplemented( + "Only support double or float when gathering SelectedRows, but got " + "%s.", + framework::DataTypeToString(in_selected_rows[0]->value().type()))); } #endif }); @@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE(platform::dynload::ncclReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, static_cast(type), ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); }); @@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() { } }); #else - PADDLE_THROW("CUDA is not enabled."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif } else { - PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The place of tensor should be CPUPlace or CUDAPlace, but got %s.", + lod_tensors[0]->place())); } } } diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index d71251b76c75b08e35c6b2ba3af2f8ab2e53308c..ba03c3a267aec821f83f70e694b79833989743c4 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/reduce_op_handle.h" + #include + #include "gtest/gtest.h" #include "paddle/fluid/platform/device_context.h" @@ -69,7 +71,8 @@ struct TestReduceOpHandle { } nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { int count = 8; @@ -103,7 +106,8 @@ struct TestReduceOpHandle { op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else - PADDLE_THROW("CUDA is not support."); + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with NCLL.")); #endif } else { #if defined(PADDLE_WITH_NCCL) @@ -164,7 +168,10 @@ struct TestReduceOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); + + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "Variable %s is not found in scope.", "input")); auto in_selected_rows = in_var->GetMutable(); auto value = in_selected_rows->mutable_value(); value->mutable_data(kDims, gpu_list_[input_scope_idx]); @@ -178,7 +185,9 @@ struct TestReduceOpHandle { } auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL(out_var, + platform::errors::NotFound( + "Variable %s is not found in scope.", "out")); auto out_selected_rows = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); @@ -196,9 +205,18 @@ struct TestReduceOpHandle { auto &out_select_rows = out_var->Get(); auto rt = out_select_rows.value(); - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, + platform::errors::InvalidArgument( + "The height of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + height, out_select_rows.height())); for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + PADDLE_ENFORCE_EQ( + out_select_rows.rows()[k], rows[k % rows.size()], + platform::errors::InvalidArgument( + "The item at position %d of rows of SelectedRows is not equal to " + "the expected, expect %d, but got %d.", + k, rows[k % rows.size()], out_select_rows.rows()[k])); } f::Tensor result_tensor; @@ -208,7 +226,7 @@ struct TestReduceOpHandle { for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); } - } + } // namespace details void TestReduceLodTensors(size_t output_scope_idx) { std::vector send_vector(static_cast(f::product(kDims))); @@ -220,7 +238,9 @@ struct TestReduceOpHandle { for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); ++input_scope_idx) { auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound( + "Variable %s is not found in scope.", "input")); auto in_lod_tensor = in_var->GetMutable(); in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); in_lod_tensor->set_lod(lod); @@ -230,7 +250,9 @@ struct TestReduceOpHandle { } auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NOT_NULL(out_var, + platform::errors::NotFound( + "Variable %s is not found in scope.", "out")); 
auto out_lodtensor = out_var->GetMutable(); auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); @@ -254,7 +276,7 @@ struct TestReduceOpHandle { ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); } } }; TEST(ReduceTester, TestCPUReduceTestSelectedRows) { TestReduceOpHandle test_op; diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc index bf93d8f85b16cbe47373c1982c4eff2d678158c8..079e9abc895ca560c2f607b225dc6f6587b75dfb 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -111,13 +111,12 @@ void ShareTensorBufferFunctor::CallOnce() { auto *out_var = exec_scope_->FindVar(out_var_names_[i]); PADDLE_ENFORCE_NOT_NULL( in_var, platform::errors::NotFound( - "The input variable(%s)to be inplaced should not be NULL.", + "The variable(%s) to be inplaced is not found in scope.", in_var_infos_[i]->Name())); PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "The output variable(%s) to be inplaced should not be NULL.", - out_var_names_[i])); + out_var, platform::errors::NotFound( + "The variable(%s) to be inplaced is not found in scope.", + out_var_names_[i])); PADDLE_ENFORCE_NE( in_var, out_var, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 3f9af1c3a1289cc3453d850a2ffa9f78900f3367..37399e5ddc09d9c8237319682e80d352d658411d 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" + #include #include + #include "dgc/dgc.h" #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" @@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( is_encoded_(is_encoded), nranks_(nranks) { // TODO(gongwb) :polish them! 
- PADDLE_ENFORCE_EQ(is_encoded, true); + PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument( + "The argument is_encoded is false.")); VLOG(1) << "Use dgc allreduce mode" << ", nranks:" << nranks_; - PADDLE_ENFORCE_GT(local_scopes_.size(), 0); + PADDLE_ENFORCE_GT(local_scopes_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of local scopes should be > 0, but got %zu.", + local_scopes_.size())); auto nranks_name = g_dgc_nranks; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto *local_scope = local_scopes_[i]; auto nranks_var = local_scope->FindVar(nranks_name); - if (nranks_var == nullptr) { - PADDLE_THROW("not find nranks_var:%s", nranks_name); - } + + PADDLE_ENFORCE_NOT_NULL( + nranks_var, platform::errors::NotFound( + "Variable %s is not found in scope.", nranks_name)); float *dgc_nranks = nranks_var->GetMutable()->data(); *dgc_nranks = nranks; @@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ( in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); + platform::errors::PreconditionNotMet( + "The number of input variables should be equal to the number of " + "places, but got the number of input variables is %zu and the " + "number of places is %zu.", + in_var_handles.size(), places_.size())); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); + platform::errors::PreconditionNotMet( + "The number of input variables should be equal to the number of " + "output variables, but got the number of input variables is %zu and " + "the number of output variables is %zu.", + in_var_handles.size(), out_var_handles.size())); std::vector ins; std::vector gathers; @@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto encode_var_name = original_name + g_dgc_encoded; auto *in_var = local_scope->FindVar(encode_var_name); - PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); + PADDLE_ENFORCE_NOT_NULL( + in_var, platform::errors::NotFound("Variable %s is not found in scope.", + encode_var_name)); auto &in = in_var->Get(); ins.emplace_back(&in); auto gather_var_name = original_name + g_dgc_gather; auto *gather_var = local_scope->FindVar(gather_var_name); - PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null", - gather_var_name); + PADDLE_ENFORCE_NOT_NULL( + gather_var, platform::errors::NotFound( + "Variable %s is not found in scope.", gather_var_name)); auto *gather = gather_var->GetMutable(); gathers.emplace_back(gather); @@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() { } } - PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); - PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ins[0]->place()), true, + platform::errors::InvalidArgument( + "The place of input variable should be CUDAPlace, but got %s.", + ins[0]->place())); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(outs[0]->place()), true, + platform::errors::InvalidArgument( + "The place of output variable should be CUDAPlace, but got %s.", + outs[0]->place())); + PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet( + "The nccl contexts are NULL.")); int dtype = -1; size_t in_numel = 0; size_t out_numel = 0; - PADDLE_ENFORCE(nranks_ > 1); + PADDLE_ENFORCE_GT( + nranks_, 1, + 
platform::errors::PreconditionNotMet( + "The number of ranks should be > 1, but got %d.", nranks_)); std::vector> all_gather_calls; std::vector> sparse_reduce_calls; @@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() { dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; in_numel = (in_numel == 0) ? static_cast(in.numel()) : in_numel; - PADDLE_ENFORCE(in_numel % 2 == 0); - PADDLE_ENFORCE(in_numel / 2 == static_cast(k)); + PADDLE_ENFORCE_EQ(in_numel % 2, 0, + platform::errors::InvalidArgument( + "The number of elements of input variable should be " + "even, but got %zu.", + in_numel)); + PADDLE_ENFORCE_EQ(in_numel / 2, static_cast(k), + platform::errors::InvalidArgument( + "The number of elements of input variable should be " + "2 * k, but got %zu.", + in_numel)); out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; @@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() { PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( gather_buff, k, out_tensor_buf, static_cast(out_numel), nranks_, stream), - true); + true, platform::errors::Unavailable( + "Calling sparseReduce() failed.")); }); } @@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc( int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { auto original_name = paddle::framework::GradOriginalVarName(grad_name); auto var_name = original_name + g_dgc_k; - PADDLE_ENFORCE(local_scopes_.size() > 0); + PADDLE_ENFORCE_GT(local_scopes_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of local scopes should be > 0, but got %zu.", + local_scopes_.size())); auto *scope = local_exec_scopes_[0]; auto var = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable %s is not found in scope.", + var_name)); auto tensor = var->Get().data(); return *tensor; } @@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() { } auto counter_name = g_dgc_counter_name; auto step_name = g_dgc_rampup_begin_step; - PADDLE_ENFORCE(local_scopes_.size() > 0); + + PADDLE_ENFORCE_GT(local_scopes_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of local scopes should be > 0, but got %zu.", + local_scopes_.size())); auto *local_scope = local_exec_scopes_[0]; auto count_var = local_scope->FindVar(counter_name); auto step_var = local_scope->FindVar(step_name); - if (count_var == nullptr || step_var == nullptr) { - PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, - step_var); - } + + PADDLE_ENFORCE_NOT_NULL( + count_var, platform::errors::NotFound( + "Variable %s is not found in scope.", counter_name)); + PADDLE_ENFORCE_NOT_NULL( + step_var, platform::errors::NotFound("Variable %s is not found in scope.", + step_name)); float count = *count_var->Get().data(); float step = *step_var->Get().data(); diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index ee2ef9a0c3d3595a26c248e63a9e19af784f8bf7..f6f3098613ba194bea90a36efc3153cf63d2db5b 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -74,7 +74,9 @@ class PullDenseWorker { virtual void Initialize(const TrainerDesc& param); #ifdef PADDLE_WITH_CUDA void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); } +#endif +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) {
places_.push_back(place); } @@ -135,9 +137,9 @@ class PullDenseWorker { #ifdef PADDLE_WITH_CUDA std::vector copy_streams_; +#endif std::vector places_; std::vector thread_scopes_; -#endif }; // should incorporate different type of device @@ -161,6 +163,7 @@ class DeviceWorker { virtual void SetDataFeed(DataFeed* data_feed); virtual void SetWorkerNum(int num) {} virtual void CacheProgram(const ProgramDesc& main_program) {} + virtual void GetXpuOpIndex() {} virtual void SetNeedDumpField(bool need_dump_field) { need_dump_field_ = need_dump_field; } diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index c9ae5a67950c8d305cedf23654a56d4f48d8dcba..21e28d7ac86d06571e49a522db13c17c0ebf33be 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -97,6 +97,7 @@ message AsyncConfig { optional int32 thread_pool_size = 6 [ default = 1 ]; optional int32 send_wait_times = 7 [ default = 1 ]; optional bool runtime_split_send_recv = 8 [ default = false ]; + optional bool launch_barrier = 9 [ default = true ]; } message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 3c076805932d6489676bb9290468821c96931c3c..693073d1fc73a65bd17e34da864f9d8df019043c 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -745,7 +745,57 @@ void FleetWrapper::PushDenseVarsAsync( push_sparse_status->push_back(std::move(status)); } } +#endif + +#ifdef PADDLE_WITH_XPU +void FleetWrapper::PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, + const paddle::platform::Place& place) { +#ifdef PADDLE_WITH_PSLIB + std::vector regions; + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g_data = tensor->data(); + + Variable* pin_var = scope.FindVar(t + "pin"); + LoDTensor* pin_tensor = pin_var->GetMutable(); + float* pin_g = + pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); + memory::Copy(platform::CPUPlace(), pin_g, + BOOST_GET_CONST(platform::XPUPlace, place), g_data, + sizeof(float) * count); + + float* g = pin_g; + if (scale_datanorm >= 0) { + if (t.find(".batch_size@GRAD") != std::string::npos || + t.find(".batch_sum@GRAD") != std::string::npos) { + Eigen::Map mat(g, 1, count); + float scale = 1.0 / batch_size; + mat *= scale; + } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) { + VLOG(3) << "epsilon: " << scale_datanorm; + for (int i = 0; i < count; ++i) { + g[i] = (g[i] - batch_size * scale_datanorm) / batch_size + + batch_size * scale_datanorm; + } + } + } + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + if (push_sparse_status) { + push_sparse_status->push_back(std::move(status)); + } +#endif +} #endif void FleetWrapper::PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index be87bdf1e755c63e55ba59d702f43f4aeba42130..ae86835f38df77a3a7661433501cdd2440553d17 100644 --- 
a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -160,6 +160,14 @@ class FleetWrapper { float scale_datanorm, int batch_size, const paddle::platform::Place& place, cudaStream_t stream, cudaEvent_t event); +#endif +#ifdef PADDLE_WITH_XPU + void PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size, + const paddle::platform::Place& place); #endif void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 7a27b6a9d7a7b82e84438d101774591dd2e732af..8e232560ab6876995a735b6901a5459265f9cb05 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -113,30 +113,66 @@ void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope, if (platform::is_cpu_place(tensor->place())) { memcpy(data_ptr, tensor->data(), tensor->numel() * SizeOfType(tensor->type())); -#ifdef PADDLE_WITH_CUDA } else { +#ifdef PADDLE_WITH_CUDA memory::Copy(platform::CPUPlace(), data_ptr, BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), tensor->data(), tensor->numel() * SizeOfType(tensor->type()), nullptr); - } -#else - } #endif +#ifdef PADDLE_WITH_XPU + memory::Copy(platform::CPUPlace(), data_ptr, + BOOST_GET_CONST(platform::XPUPlace, tensor->place()), + tensor->data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif + } } -// void HeterWrapper::DeSerializeToTensor(Scope* scope, -// const HeterRequest* request) { #ifdef PADDLE_WITH_CUDA void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, cudaStream_t stream) { + // const VariableMessage& req_var = request->vars(); + auto* var = scope->FindVar(req_var.varname()); + auto* tensor = var->GetMutable(); + + std::vector vec_dim; + for (auto& x : req_var.dims()) { + vec_dim.push_back(x); + } + tensor->Resize(make_ddim(vec_dim)); + + LoD lod; + for (int i = 0; i < req_var.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) { + v.push_back(req_var.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + + void* tensor_data = + tensor->mutable_data(place, ToVarType(req_var.data_type())); + +#ifdef PADDLE_WITH_CUDA + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, + platform::CPUPlace(), req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type()), stream); #else + memcpy(tensor_data, req_var.data().data(), + tensor->numel() * SizeOfType(tensor->type())); +#endif +} +#endif + +// void HeterWrapper::DeSerializeToTensor(Scope* scope, +// const HeterRequest* request) { void HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place) { -#endif // const VariableMessage& req_var = request->vars(); auto* var = scope->FindVar(req_var.varname()); auto* tensor = var->GetMutable(); @@ -160,10 +196,10 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data(place, ToVarType(req_var.data_type())); -#ifdef PADDLE_WITH_CUDA - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data, +#ifdef PADDLE_WITH_XPU + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place), tensor_data, platform::CPUPlace(), req_var.data().data(), - tensor->numel() * SizeOfType(tensor->type()), 
stream); + tensor->numel() * SizeOfType(tensor->type())); #else memcpy(tensor_data, req_var.data().data(), tensor->numel() * SizeOfType(tensor->type())); @@ -184,7 +220,8 @@ framework::proto::VarType::Type HeterWrapper::ToVarType( case VariableMessage::BOOL: return framework::proto::VarType::BOOL; // NOLINT default: - VLOG(0) << "Not support type " << type; + PADDLE_THROW(platform::errors::InvalidArgument( + "ToVarType:Unsupported type %d", type)); } } diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index fbed74800b4ccd10d67f3d6d8212eca73a566629..6bbbaacdde3b30a8956794b650e2ff7b1f503a59 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#include +#include +#include +#include +#include "io/fs.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/trainer.h" +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" - +#endif namespace paddle { namespace framework { @@ -34,6 +46,7 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, int place_num = trainer_desc.worker_places_size(); for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place = platform::CUDAPlace(num); platform::CUDADeviceGuard guard(place.device); cudaStream_t stream; @@ -44,6 +57,11 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); events_.push_back(event); +#endif +#ifdef PADDLE_WITH_XPU + platform::XPUPlace place = platform::XPUPlace(num); + places_.push_back(place); +#endif } // thread_num_ = trainer_desc.thread_num(); // SetDataset(dataset); @@ -95,11 +113,17 @@ void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc, void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { auto place = places_[num]; Scope* scope = place_scopes_[num]; +#ifdef PADDLE_WITH_CUDA auto stream = copy_streams_[num]; auto event = events_[num]; - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); +#endif + +#ifdef PADDLE_WITH_XPU + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); +#endif + auto& block = program.Block(0); for (auto& var : block.AllVars()) { if (var->Persistable()) { @@ -116,13 +140,28 @@ void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) { HeterMemCpy(thread_tensor, root_tensor, place, stream); \ } \ } while (0) + +#define HeterMemcpyXpuFunc(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + HeterMemCpy(thread_tensor, root_tensor, place); \ + } \ + } while (0) +#ifdef PADDLE_WITH_CUDA _ForEachDataType_(HeterMemcpyFunc); +#endif +#ifdef PADDLE_WITH_XPU + _ForEachDataType_(HeterMemcpyXpuFunc); +#endif } } +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); 
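The trainer changes here and below split device-specific work into parallel preprocessor branches rather than a runtime dispatch. A minimal sketch of the copy pattern, assuming hypothetical names dst_place, dst_ptr, src_ptr and num_bytes for the tensors involved:

// CUDA host-to-device copies are asynchronous on a stream; the XPU copy used
// in this PR takes no stream, so the two overloads differ in arity and the
// CUDA path must later synchronize (cudaEventRecord/cudaEventSynchronize).
#ifdef PADDLE_WITH_CUDA
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr,
             platform::CPUPlace(), src_ptr, num_bytes, stream);
#endif
#ifdef PADDLE_WITH_XPU
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
             platform::CPUPlace(), src_ptr, num_bytes);
#endif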
+#endif } +#ifdef PADDLE_WITH_CUDA template void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, @@ -141,6 +180,27 @@ void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, root_ptr, sizeof(T) * root_tensor->numel(), stream); } } +#endif + +#ifdef PADDLE_WITH_XPU +template +void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor, + LoDTensor* root_tensor, + const paddle::platform::Place& thread_place) { + T* thread_ptr = + thread_tensor->mutable_data(root_tensor->dims(), thread_place); + T* root_ptr = root_tensor->data(); + if (platform::is_cpu_place(root_tensor->place())) { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, + platform::CPUPlace(), root_ptr, + sizeof(T) * root_tensor->numel()); + } else { + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, thread_place), thread_ptr, + BOOST_GET_CONST(platform::XPUPlace, root_tensor->place()), + root_ptr, sizeof(T) * root_tensor->numel()); + } +} +#endif void HeterXpuTrainer::DumpWork(int tid) {} @@ -171,13 +231,16 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { CreateThreadParam(main_program, i); pull_dense_worker_->AddThreadScope(scope); pull_dense_worker_->AddPlace(places_[i]); +#ifdef PADDLE_WITH_CUDA pull_dense_worker_->AddStream(copy_streams_[i]); +#endif } - pull_dense_worker_->Start(); +#ifdef PADDLE_WITH_CUDA for (auto& stream : copy_streams_) { cudaStreamSynchronize(stream); } +#endif op_names_.clear(); for (auto& op_desc : block.AllOps()) { std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); @@ -220,10 +283,12 @@ void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) { OperatorBase* local_op_ptr = local_op.release(); (context->ops_).push_back(local_op_ptr); } +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); +#endif object_pool_.Push(context); } } @@ -267,12 +332,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, } \ } while (0) _ForEachDataType_(MergeCallback); - if (platform::is_gpu_place(thread_tensor->place())) { + if (!platform::is_cpu_place(thread_tensor->place())) { +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, thread_tensor->place()).device; platform::CUDADeviceGuard guard(dev_id); cudaMemset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); +#endif +#ifdef PADDLE_WITH_XPU + auto place = thread_tensor->place(); + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::DeviceContext* dev_ctx = pool.Get(place); + const platform::XPUDeviceContext* xpu_ctx = + reinterpret_cast(dev_ctx); + xpu::memset(xpu_ctx->x_context(), thread_tensor->data(), 0, + thread_tensor->numel() * SizeOfType(thread_tensor->type())); +#endif } else { memset(thread_tensor->data(), 0, thread_tensor->numel() * SizeOfType(thread_tensor->type())); @@ -281,12 +359,25 @@ int HeterXpuTrainer::EndPass(const HeterRequest* request, auto* merge_var = response->add_vars(); heter_ptr_->SerializeToReq(need_merge_var_names_[i], root_scope_, merge_var); - if (platform::is_gpu_place(root_tensor->place())) { + if (!platform::is_cpu_place(root_tensor->place())) { +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()).device; platform::CUDADeviceGuard 
guard(dev_id); cudaMemset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); +#endif +#ifdef PADDLE_WITH_XPU + auto place = root_tensor->place(); + xpu_set_device(BOOST_GET_CONST(platform::XPUPlace, place).device); + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::DeviceContext* dev_ctx = pool.Get(place); + const platform::XPUDeviceContext* xpu_ctx = + reinterpret_cast(dev_ctx); + xpu::memset(xpu_ctx->x_context(), root_tensor->data(), 0, + root_tensor->numel() * SizeOfType(root_tensor->type())); +#endif } else { memset(root_tensor->data(), 0, root_tensor->numel() * SizeOfType(root_tensor->type())); @@ -346,11 +437,12 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, OperatorBase* local_op_ptr = local_op.release(); (context->ops_).push_back(local_op_ptr); } - +#ifdef PADDLE_WITH_CUDA auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; platform::CUDADeviceGuard guard(dev_id); PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming)); +#endif } context->Reset(); @@ -359,15 +451,22 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, auto deserial_timer = std::make_shared("xpu_service_deserial"); for (int i = 0; i < request->vars_size(); ++i) { +#ifdef PADDLE_WITH_CUDA heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place, copy_streams_[context->place_num_]); +#endif +#ifdef PADDLE_WITH_XPU + heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place); +#endif } +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaEventRecord(context->event_, copy_streams_[context->place_num_])); while (cudaEventQuery(context->event_) != cudaSuccess) { VLOG(3) << "wait for kernel"; bthread_yield(); } +#endif } { @@ -378,6 +477,7 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, op->Run(*(context->scope_), place); } } +#ifdef PADDLE_WITH_CUDA auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -391,6 +491,10 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, bthread_yield(); } } +#endif +#ifdef PADDLE_WITH_XPU + xpu_wait(); +#endif for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) { const std::string& varname = trainer_desc_.xpu_send_list(i); @@ -407,11 +511,19 @@ int HeterXpuTrainer::RunTask(const HeterRequest* request, ++i) { uint64_t tid = static_cast(param_.program_config(0).push_dense_table_id(i)); +#ifdef PADDLE_WITH_CUDA fleet_ptr_->PushDenseVarsAsync( *(context->scope_), tid, dense_grad_names_[tid], &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), places_[context->place_num_], copy_streams_[context->place_num_], context->event_); +#endif +#ifdef PADDLE_WITH_XPU + fleet_ptr_->PushDenseVarsAsync( + *(context->scope_), tid, dense_grad_names_[tid], + &(context->push_dense_status_), scale_datanorm_, request->cur_batch(), + places_[context->place_num_]); +#endif } for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); ++i) { @@ -453,7 +565,6 @@ void HeterXpuTrainer::Finalize() { pull_dense_worker_->Stop(); root_scope_->DropKids(); } - } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 9c1eaa99a3ca04ddbeecab639d5587d5509e3f00..96952e20c2158453df0d94c9e43c64bb6bb1e04f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ 
b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1894,8 +1894,7 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { - std::unordered_set supported_op_types = - std::unordered_set(); + std::unordered_set supported_op_types{"conv2d"}; if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index c399c5d02eb19d08f929168de0804ecea18cde37..6aeef8a39b53342a1c7eb99ba0892bda29a8fbcd 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -62,13 +62,15 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #ifdef PADDLE_WITH_CUDA copy_streams_.clear(); +#endif +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif } void PullDenseWorker::CreatePinVar() { -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_PSLIB) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -83,8 +85,13 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); LoDTensor* pin_tensor = ptr->GetMutable(); +#ifdef PADDLE_WITH_CUDA pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); +#endif +#ifdef PADDLE_WITH_XPU + pin_tensor->mutable_data(tensor->dims(), platform::CPUPlace()); +#endif } } #endif @@ -107,7 +114,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { exit(-1); } status_vec->resize(0); -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -125,9 +132,16 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); LoDTensor* tensor = var->GetMutable(); float* w = tensor->data(); +#ifdef PADDLE_WITH_CUDA memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w, platform::CUDAPinnedPlace(), pin_w, sizeof(float) * tensor->numel(), copy_streams_[i]); +#endif +#ifdef PADDLE_WITH_XPU + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, places_[i]), w, + platform::CPUPlace(), pin_w, + sizeof(float) * tensor->numel()); +#endif } } } @@ -148,7 +162,7 @@ void PullDenseWorker::PullDense(bool force_update) { uint64_t tid = static_cast( dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { -#ifdef PADDLE_WITH_CUDA +#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], &pull_dense_status_, false); diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index d041ef48e2c04a8e6db6bee7fe1c762de89bc9eb..ecaec49aa461cd6134cb60b7af7adb50f1a94686 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -138,7 +138,8 @@ class DistMultiTrainer : public MultiTrainer { std::shared_ptr pull_dense_worker_; }; -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) class 
HeterServiceContext { public: HeterServiceContext() {} @@ -151,7 +152,9 @@ class HeterServiceContext { void Reset() { push_dense_status_.clear(); } int place_num_; Scope* scope_{nullptr}; +#ifdef PADDLE_WITH_CUDA cudaEvent_t event_; +#endif std::vector ops_; std::vector<::std::future> push_dense_status_; }; @@ -178,10 +181,18 @@ class HeterXpuTrainer : public TrainerBase { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } + virtual std::string GetDumpPath(int tid) { return ""; } + virtual void InitDumpEnv() {} template +#ifdef PADDLE_WITH_CUDA void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, const paddle::platform::Place& thread_place, cudaStream_t stream); +#endif +#ifdef PADDLE_WITH_XPU + void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, + const paddle::platform::Place& thread_place); +#endif void CreateThreadParam(const ProgramDesc& program, int num); template void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); @@ -207,9 +218,11 @@ class HeterXpuTrainer : public TrainerBase { std::vector op_names_; std::vector place_scopes_; BtObjectPool object_pool_; - std::vector copy_streams_; std::vector places_; +#ifdef PADDLE_WITH_CUDA + std::vector copy_streams_; std::vector events_; +#endif }; #endif diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 15584620d86b62c05e5fef841fc26058e8610c21..cc92c50cc428a59905ce3864a0b89f591d1b2390 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -63,7 +63,8 @@ std::shared_ptr TrainerFactory::CreateTrainer( REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer); -#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ + (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif #if defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 7caeb4378ce3d1ca1d1557054642c9fa184bea39..07f1868b7fa29914b4d362cf2c71d9380ca446be 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -13,9 +13,11 @@ // limitations under the License. #include "paddle/fluid/imperative/gradient_accumulator.h" + #include #include #include + #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" @@ -136,9 +138,13 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { return; } - PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true, - "dst_numel %d vs. 
src_numel %d", dst_tensor->numel(), - numel); + PADDLE_ENFORCE_EQ( + dst_tensor->numel(), numel, + platform::errors::PreconditionNotMet( + "The number of elements of source tensor and destination tensor " + "should be equal, but got the number of elements of source tensor is " + "%zu and the number of elements of destination tensor is %zu.", + numel, dst_tensor->numel())); auto data_type = src_tensor.type(); auto place = src_tensor.place(); diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 9f4cf713f7c19c0d8bebdaf023d512a4f593d8e4..59ff5b4eae4419274412160632ed78c02b298867 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/imperative/jit/program_desc_tracer.h" + #include #include @@ -203,7 +204,8 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( void ProgramDescTracer::InsertVarIfNotExist( const std::shared_ptr &new_var, bool is_input) { - PADDLE_ENFORCE_NOT_NULL(new_var); + PADDLE_ENFORCE_NOT_NULL(new_var, platform::errors::InvalidArgument( + "The variable to insert is NULL.")); if (vars_.count(new_var) != 0) return; auto new_var_desc = new framework::VarDesc(""); @@ -220,7 +222,9 @@ void ProgramDescTracer::InsertVarIfNotExist( } const auto &inner_var = new_var->Var(); - PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true); + PADDLE_ENFORCE_EQ(inner_var.IsInitialized(), true, + platform::errors::InvalidArgument( + "The variable to insert is not initialized.")); if (inner_var.IsType()) { const auto &tensor = inner_var.Get(); new_var_desc->SetType(framework::proto::VarType::LOD_TENSOR); @@ -232,8 +236,9 @@ void ProgramDescTracer::InsertVarIfNotExist( new_var_desc->SetDataType(framework::proto::VarType::FP32); } } else { - PADDLE_THROW("Not support variable type %s", - framework::ToTypeName(inner_var.Type())); + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support variable type %s.", + framework::ToTypeName(inner_var.Type()))); } } diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 115078e7eadfc03153fc95ef05a9d4bb6cd40369..c8fd31fcbffe680da36d03276ec0d4c1095030bc 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/imperative/nccl_context.h" + #include "paddle/fluid/platform/collective_helper.h" namespace paddle { @@ -21,8 +22,10 @@ namespace imperative { void NCCLParallelContext::RecvNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ(addr.size(), 2UL, - "The endpoint should contain host and port: %s", ep); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); std::string host = addr[0]; int port = std::stoi(addr[1]); @@ -32,27 +35,41 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, char buffer[1024] = {0}; int opt = 0; // creating socket fd - if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) - PADDLE_THROW("create server fd failed"); - if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) - PADDLE_THROW("set socket opt failed"); + if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) { + PADDLE_THROW( + platform::errors::Unavailable("Create server file descriptor failed.")); + } + + if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) { + PADDLE_THROW(platform::errors::Unavailable("Set socket options failed.")); + } address.sin_family = AF_INET; address.sin_addr.s_addr = INADDR_ANY; address.sin_port = htons(port); - if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) - PADDLE_THROW("binding failed on ep: %s", ep); + if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) { + PADDLE_THROW( + platform::errors::Unavailable("Bind on endpoint %s failed.", ep)); + } + VLOG(3) << "listening on: " << ep; - if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed"); + if (listen(server_fd, 3) < 0) { + PADDLE_THROW(platform::errors::Unavailable( + "Listen on server file descriptor failed.")); + } if ((new_socket = accept(server_fd, reinterpret_cast(&address), - reinterpret_cast(&addrlen))) < 0) - PADDLE_THROW("accept the new socket fd failed"); + reinterpret_cast(&addrlen))) < 0) { + PADDLE_THROW(platform::errors::Unavailable( + "Accept the new socket file descriptor failed.")); + } + + if (read(new_socket, buffer, 1024) < 0) { + PADDLE_THROW(platform::errors::Unavailable("Read from socket failed.")); + } - if (read(new_socket, buffer, 1024) < 0) - PADDLE_THROW("reading the ncclUniqueId from socket failed"); VLOG(3) << "recevived the ncclUniqueId"; memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES); @@ -63,8 +80,10 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep, void NCCLParallelContext::SendNCCLID(const std::string &ep, ncclUniqueId *nccl_id) { auto addr = paddle::string::Split(ep, ':'); - PADDLE_ENFORCE_EQ(addr.size(), 2UL, - "The endpoint should contain host and port: %s", ep); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); std::string host = addr[0]; int port = std::stoi(addr[1]); // struct sockaddr_in address; @@ -73,15 +92,17 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, char buffer[1024] = {0}; memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES); - if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) - PADDLE_THROW("create socket failed"); + if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + PADDLE_THROW(platform::errors::Unavailable("Create socket failed.")); + } memset(&serv_addr, '0', sizeof(serv_addr)); serv_addr.sin_family = AF_INET; serv_addr.sin_port = htons(port); - if (inet_pton(AF_INET, 
host.c_str(), &serv_addr.sin_addr) <= 0) - PADDLE_THROW("invalied address: %s", ep); + if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) { + PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep)); + } int try_times = 0; while (true) { diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 92e3933a072832fa42520e67f455d3dc90118518..c661c9f9c37509f6b55f6ce8b67b11752c68418a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -127,11 +127,10 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { "Baidu Kunlun Card is properly installed.", ret)); ret = xpu_malloc(reinterpret_cast(&p), size); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + PADDLE_ENFORCE_EQ( + ret, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], not enough memory", ret)); if (FLAGS_init_allocated_mem) { PADDLE_THROW(platform::errors::Unimplemented( "xpu memory FLAGS_init_allocated_mem is not implemented.")); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5a3660cee85762f3d76129dfb694eeb6d87bb52c..a640a6c745ccb7e7cda0e47b17104ec990fce0dd 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -763,10 +763,28 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; -// +// AbsGrad: dx=dy if x >=0 else -dy +// AbsDoubleGrad: ddy = ddx if x >=0 else -ddx +template +class AbsDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("abs_grad_grad"); + // input1: x + op->SetInput("X", this->Input("X")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 -// template class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { public: @@ -873,6 +891,28 @@ class SquareDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; +// log Grad: dx = dout / x +// log Grad Grad: ddout = ddx / x; dx = -(dout / x) * (ddx / x) +template +class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("log_grad_grad"); + op->SetInput("X", this->Input("X")); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // X@GRAD: dx + op->SetOutput("DX", this->InputGrad("X")); + // Out@GRAD@GRAD: ddy + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer, {framework::GradVarName("Out"), framework::GradVarName("X")}); @@ -1214,7 +1254,13 @@ REGISTER_OPERATOR( std::conditional>(), ops::ActFwdInplaceInferer, void>::type); REGISTER_OPERATOR(abs_grad,
ops::ActivationOpGrad, - ops::ActivationGradOpInplaceInferer); + ops::ActivationGradOpInplaceInferer, + ops::AbsDoubleGradMaker, + ops::AbsDoubleGradMaker); +REGISTER_OPERATOR( + abs_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); REGISTER_OP_CPU_KERNEL(abs, ops::ActivationKernel>, ops::ActivationGradKernel>); +REGISTER_OP_CPU_KERNEL( + abs_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +/* ========================================================================== */ + +/* ========================== Log register ==================================*/ +REGISTER_OPERATOR( + log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + ops::ActFwdInplaceInferer); +REGISTER_OPERATOR(log_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::LogDoubleGradMaker, + ops::LogDoubleGradMaker); + +REGISTER_OPERATOR( + log_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor); + +REGISTER_OP_CPU_KERNEL( + log_grad_grad, ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>); /* ========================================================================== */ /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 48ec90471f0becf921e7e68eb8722544885aaa7a..839776ad58d0352cd9bdd59530951e4eea1120b3 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -160,7 +160,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ExpGradFunctor>); /* ========================================================================== */ -/* ========================== exp register ============================ */ +/* ========================== abs register ============================ */ REGISTER_OP_CUDA_KERNEL( abs, ops::ActivationKernel>, @@ -180,4 +180,28 @@ REGISTER_OP_CUDA_KERNEL( ops::AbsGradFunctor>, ops::ActivationGradKernel>); +REGISTER_OP_CUDA_KERNEL( + abs_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +/* ========================================================================== */ + +/* ========================== Log register ==================================*/ +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + log_grad_grad, ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>, + ops::LogDoubleGradKernel>); /* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 00a7c063c9155488d117332d5ef3541d16d76bdb..a5c613297a473c326a2d239ad57ac0cca5a165f3 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1430,6 +1430,27 @@ class ActivationDoubleGradKernel } }; +template +struct AbsGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + 
const framework::Tensor* Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "AbsGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "AbsGradGrad")); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "AbsGradGrad")); + ddout.device(*d) = ddx * x.sign(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct ReluGradGradFunctor : public BaseActivationFunctor { template @@ -1642,6 +1663,10 @@ class SquareDoubleGradKernel } }; +template +class LogDoubleGradKernel + : public SquareDoubleGradKernel {}; + template class ELUDoubleGradKernel : public framework::OpKernel { @@ -1831,6 +1856,37 @@ class PowGradKernel functor(*place, x, out, dout, dx); } }; + +template +struct LogGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* ddX, framework::Tensor* ddOut, + const framework::Tensor* dOut, framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad")); + // ddout = ddx / x; dx = -(dout / x) * (ddx / x) + // calculate dx first, so ddout can inplace ddx + if (dX) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad")); + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad")); + dx.device(*d) = dout * static_cast(-1) * ddx / (x * x); + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad")); + ddout.device(*d) = ddx * static_cast(1) / x; + } + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + } // namespace operators } // namespace paddle @@ -1851,7 +1907,6 @@ class PowGradKernel __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ - __macro(log, Log, LogFunctor, LogGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..49b7a08a7b52b0a1dd110215aad301f1b484317e --- /dev/null +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/activation_op.h" +#include +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class XPUActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +template +class XPUActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto &attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +template +void xpu_activation_forward(const framework::ExecutionContext &ctx, + xpu::Activation_t type) { + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); + const T *x_data = x->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + int r = 0; + if (xpu::Activation_t::ACT_POW == type.type) { + type.pow_factor = ctx.Attr("factor"); + } + auto xpu_context = ctx.device_context().x_context(); + r = xpu::activation_forward(xpu_context, type, x->numel(), + reinterpret_cast(x_data), + reinterpret_cast(y_data)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); +} + +template +void xpu_activation_backward(const framework::ExecutionContext &ctx, + xpu::Activation_t type) { + /* TODO: relu tanh sigmoid are inplace */ + const auto *x = ctx.Input("X"); + auto *y = ctx.Input("Out"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + auto *dX = ctx.Output(framework::GradVarName("X")); + const T *x_data = nullptr; + const T *y_data = nullptr; + const T *y_grad = nullptr; + if (x != nullptr) x_data = x->data(); + if (y != nullptr) y_data = y->data(); + if (dOut != nullptr) y_grad = dOut->data(); + T *x_grad = dX->mutable_data(ctx.GetPlace()); + auto xpu_context = ctx.device_context().x_context(); + int r = xpu::activation_backward(xpu_context, type, dX->numel(), + reinterpret_cast(x_data), + reinterpret_cast(y_data), + reinterpret_cast(y_grad), + reinterpret_cast(x_grad)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); +} + +template +struct XPUActivationFunc : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_forward(ctx, + algorithm); + } +}; + +template +struct XPUActivationGradFunc : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + xpu_activation_backward(ctx, + algorithm); + } +}; + +template +using XPUReluFunctor = XPUActivationFunc; +template +using XPUSigmoidFunctor = XPUActivationFunc; +template +using XPUTanhFunctor = XPUActivationFunc; +template +using XPUGeluFunctor = XPUActivationFunc; +template +using XPULogFunctor = XPUActivationFunc; +template +using XPUSquareFunctor = XPUActivationFunc; +template +using XPUSquareGradFunctor = XPUActivationGradFunc; +template +using XPUReluGradFunctor = XPUActivationGradFunc; +template +using XPUSigmoidGradFunctor = + XPUActivationGradFunc; +template +using XPUTanhGradFunctor = XPUActivationGradFunc; +template +using
XPUGeluGradFunctor = XPUActivationGradFunc; +template +using XPUSqrtFunctor = XPUActivationFunc; +template +using XPUSqrtGradFunctor = XPUActivationGradFunc; +template +using XPUACTPowFunctor = XPUActivationFunc; +template +using XPUABSFunctor = XPUActivationFunc; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_XPU_KERNEL(act_type, \ + ops::XPUActivationKernel>); \ + REGISTER_OP_XPU_KERNEL( \ + act_type##_grad, \ + ops::XPUActivationGradKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(relu, XPUReluFunctor, XPUReluGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(tanh, XPUTanhFunctor, XPUTanhGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor, + XPUSigmoidGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor) +REGISTER_OP_XPU_KERNEL(log, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL(pow, + ops::XPUActivationKernel>); +REGISTER_OP_XPU_KERNEL(abs, + ops::XPUActivationKernel>); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index dcfe8bb1bb48a505f5526f6471e8ce9ba848b5b3..7a88403aa9daa78f1093115124eb19167bf6e99d 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -839,6 +839,7 @@ void BatchNormDoubleGradMaker::Apply(GradOpPtr op) const { op->SetInput("SavedMean", this->Input("SavedMean")); op->SetInput("SavedVariance", this->Input("SavedVariance")); if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) { + op->SetInput("Mean", this->Input("Mean")); op->SetInput("Variance", this->Input("Variance")); } op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); @@ -868,14 +869,19 @@ void BatchNormDoubleGradOp::InferShape( "BatchNormDoubleGrad"); } - OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad"); OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad"); // check output OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad"); const auto x_dims = ctx->GetInputDim("X"); - const int C = x_dims[1]; + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + ((this->IsMKLDNNType() == true) || (data_layout == DataLayout::kNCHW) + ?
x_dims[1] + : x_dims[x_dims.size() - 1]); + if (ctx->HasOutput("DX")) { ctx->SetOutputDim("DX", x_dims); } @@ -957,7 +963,9 @@ class BatchNormDoubleGradKernel Tensor inv_var_tensor; if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); inv_var_tensor.Resize({C}); T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); @@ -1077,12 +1085,12 @@ class BatchNormDoubleGradKernel // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // np.sum(dy, // axis=(n,h,w)) * (x - mean) * - // (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - + // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - // inv_var // * // np.mean(dy, axis=(n,h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(n,h,w)))) + // axis=(n,h,w))) if (ddX) { dx_arr += @@ -1176,7 +1184,8 @@ class BatchNormDoubleGradKernel C, sample_size); ddy_arr.setZero(); if (use_global_stats) { - // math: ddy = r * ddx * inv_var + // math: ddy = r * ddx * inv_var + ddbias + + // ddscale * (x - mean) * inv_var if (ddX) { ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; } @@ -1196,25 +1205,29 @@ class BatchNormDoubleGradKernel .replicate(1, sample_size) / sample_size); } - if (ddScale && ddBias) { - ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); - Tensor ddscale_tile; - ddscale_tile.Resize({C, sample_size}); - EigenArrayMap ddscale_tile_data( - ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + } - ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); - Tensor ddbias_tile; - ddbias_tile.Resize({C, sample_size}); - EigenArrayMap ddbias_tile_data( - ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); - ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + if (ddBias) { + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ddbias_tile.mutable_data(ctx.GetPlace()), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); - ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; - ddy_arr += ddbias_tile_data; - } + ddy_arr += ddbias_tile_data; } + if (data_layout == DataLayout::kNCHW) { VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; TransToChannelFirst( diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 5ec34e574504e1058021a0623d09d4d33cf75c66..654df5ccd5e9df324f6e127addadd4e71a641d94 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -47,8 +47,8 @@ void OpTester::Init(const OpTesterConfig &config) { CreateInputVarDesc(); CreateOutputVarDesc(); } else { - PADDLE_THROW(platform::errors::NotFound("Operator '%s' is not registered.", - config_.op_type)); + PADDLE_THROW(platform::errors::NotFound( + "Operator '%s' is not registered in OpTester.", config_.op_type)); } if (config_.device_id >= 0) { @@ -81,7 +81,8 @@ void OpTester::Run() { 
platform::EnableProfiler(platform::ProfilerState::kAll); platform::SetDeviceId(config_.device_id); #else - PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPlace' is not supported in CPU only device.")); #endif } @@ -162,7 +163,8 @@ framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { } else if (str == "fp64") { return framework::proto::VarType::FP64; } else { - PADDLE_THROW("Unsupported dtype %s.", str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpTester.", str.c_str())); } } @@ -233,8 +235,8 @@ void OpTester::CreateOpDesc() { case framework::proto::AttrType::INTS: case framework::proto::AttrType::FLOATS: case framework::proto::AttrType::STRINGS: - PADDLE_THROW( - platform::errors::Unimplemented("Not supported STRINGS type yet.")); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported STRINGS type in OpTester yet.")); break; case framework::proto::AttrType::LONG: { int64_t value = StringTo(value_str); @@ -242,7 +244,8 @@ void OpTester::CreateOpDesc() { } break; case framework::proto::AttrType::LONGS: default: - PADDLE_THROW("Unsupport attr type %d", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attr type %d in OpTester.", type)); } } } @@ -299,7 +302,8 @@ void OpTester::SetupTensor(framework::LoDTensor *tensor, } is.close(); } else { - PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpTester.", initializer.c_str())); } if (!platform::is_cpu_place(place_)) { @@ -351,7 +355,8 @@ void OpTester::CreateVariables(framework::Scope *scope) { static_cast(1.0), item.second.initializer, item.second.filename); } else { - PADDLE_THROW("Unsupported dtype %d.", data_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %d in OpTester.", data_type)); } VLOG(3) << "Set lod for tensor " << var_name; @@ -473,7 +478,8 @@ std::string OpTester::DebugString() { << "\n"; } break; default: - PADDLE_THROW("Unsupport attr type %d", attr_type); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported attr type %d in OpTester.", attr_type)); } ss << GenSpaces(--count) << "}\n"; } @@ -484,8 +490,10 @@ std::string OpTester::DebugString() { TEST(op_tester, base) { if (!FLAGS_op_config_list.empty()) { std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", - FLAGS_op_config_list.c_str()); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::InvalidArgument("OpTester cannot open file %s.", + FLAGS_op_config_list.c_str())); std::vector op_configs; while (!fin.eof()) { VLOG(4) << "Reading config " << op_configs.size() << "..."; diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc index 818e5f64edc2c1d213659c48d282df75625676ca..e9477798858d13e7a2862081561634011f9156c8 100644 --- a/paddle/fluid/operators/benchmark/op_tester_config.cc +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -78,7 +78,8 @@ void OpInputConfig::ParseDType(std::istream& is) { } else if (dtype_str == "fp64" || dtype_str == "double") { dtype = "fp64"; } else { - PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported dtype %s in OpInputConfig.", dtype_str.c_str())); } VLOG(4) << "dtype of input " << name << " is: " << dtype; } @@ -91,7
+92,9 @@ void OpInputConfig::ParseInitializer(std::istream& is) { const std::vector supported_initializers = {"random", "natural", "zeros", "file"}; if (!Has(supported_initializers, initializer_str)) { - PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported initializer %s in OpInputConfig.", + initializer_str.c_str())); } initializer = initializer_str; @@ -126,7 +129,12 @@ void OpInputConfig::ParseLoD(std::istream& is) { } } EraseEndSep(&lod_str); - PADDLE_ENFORCE_GE(lod_str.length(), 4U); + PADDLE_ENFORCE_GE( + lod_str.length(), 4U, + platform::errors::InvalidArgument( + "The length of lod string should be " + "equal to or larger than 4. But length of lod string is %zu.", + lod_str.length())); VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); // Parse the lod_str @@ -153,8 +161,10 @@ void OpInputConfig::ParseLoD(std::istream& is) { OpTesterConfig::OpTesterConfig(const std::string& filename) { std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", - filename.c_str()); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::InvalidArgument("OpTesterConfig cannot open file %s.", + filename.c_str())); Init(fin); } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index bf97b9d03c455182a8d95b6987896b9a580c84fe..ef8a2b38f20b99f0b1e41ddc1976f88dd8d1f5ab 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -166,7 +166,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif if (input_data_type != framework::proto::VarType::INT8 && - input_data_type != framework::proto::VarType::UINT8) { + input_data_type != framework::proto::VarType::UINT8 && + input_data_type != framework::proto::VarType::BF16) { auto filter_data_type = ctx.Input("Filter")->type(); PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, platform::errors::InvalidArgument( @@ -455,6 +456,11 @@ void Conv3DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). 
Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); AddAttr("fuse_activation", diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index a91df5b3c471e234dd1ae72771c287e21ebf7af0..51b13bc2c569d0078199e6ede42bfecb2e33148b 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -175,10 +175,6 @@ void RecvGeoSparseRecords(const CommContext &rpc_ctx, template void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - distributed::RPCClient *rpc_client = distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); @@ -188,8 +184,13 @@ void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { if (rpc_ctx.origin_varnames.size() == 1 && rpc_ctx.splited_varnames.size() == 1) { auto varname = rpc_ctx.origin_varnames[0]; - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0]; - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], cpu_ctx, + const auto place = + scope.FindVar(varname)->Get().place(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? " + << platform::is_gpu_place(place); + rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, scope, varname, varname)); for (size_t i = 0; i < rets.size(); i++) { diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ff7a71d7f03a4e91cea42926f23c7b270857ba9 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseAddXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + XPUElementwise>(ctx); + } +}; + +template +class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; + + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + auto *dy = ctx.Output(framework::GradVarName("Y")); + + auto dx_dims = dout->dims(); + auto dy_dims_untrimed = dout->dims(); + T *dx_data = NULL; + T *dy_data = NULL; + + int axis = ctx.Attr("axis"); + PADDLE_ENFORCE_GE(dx_dims.size(), dy_dims_untrimed.size(), + "Rank of the first input must be >= rank of the second input."); + + if (dx != nullptr) { + dx->mutable_data(ctx.GetPlace()); + dx_dims = dx->dims(); + dx_data = dx->data(); + } + + if (dy != nullptr) { + dy->mutable_data(ctx.GetPlace()); + dy_dims_untrimed = dy->dims(); + dy_data = dy->data(); + } + + int pre, n, post, is_common_broadcast; + if (dx_dims == dy_dims_untrimed) { + pre = post = 1; + n = dout->numel(); + } else { + axis = (axis == -1 ? dx_dims.size() - dy_dims_untrimed.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < dx_dims.size(), + "Axis should be in range [0, dx_dims.size())"); + auto dy_dims = trim_trailing_singular_dims(dy_dims_untrimed); + axis = (dy_dims.size() == 0) ? dx_dims.size() : axis; + get_mid_dims(dx_dims, dy_dims, axis, &pre, &n, &post, + &is_common_broadcast); + } + int len = pre * n * post; + + auto &dev_ctx = + ctx.template device_context(); + if (post == 1) { + int r = xpu::matrix_vector_add_grad( + dev_ctx.x_context(), dout->data(), dout->data(), + dout->data(), dout->data(), dx_data, dy_data, pre, n); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + return; + } + + if (dx == nullptr) { + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dx_data), len * sizeof(float)), + XPU_SUCCESS, platform::errors::External("XPU does not have enough memory.")); + } + + if (dy == nullptr) { + PADDLE_ENFORCE_EQ( + xpu_malloc(reinterpret_cast(&dy_data), len * sizeof(float)), + XPU_SUCCESS, platform::errors::External("XPU does not have enough memory.")); + } else { + if (len != n) { + PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&dy_data), + len * sizeof(float)), + XPU_SUCCESS, platform::errors::External( + "XPU does not have enough memory.")); + } + } + + int r = xpu::elementwise_add_grad( + dev_ctx.x_context(), dout->data() /*x*/, dout->data() /*y*/, + dout->data() /*out*/, dout->data(), dx_data, dy_data, len); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + + if ((dy != nullptr) && (len != n)) { + r = xpu::reduce_ew(dev_ctx.x_context(), dy_data, dy->data(), pre, n, + post, xpu::ElementwiseOp::ASSIGN); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r));
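+ // When Y was broadcast against X, dy was computed above at the broadcast + // length len = pre * n * post, while the real dY has length n, so the + // reduce_ew collapses the broadcast axes again (the gradient of a + // broadcast is a sum over the broadcast axes). For example, X of shape + // [2, 3, 4] and Y of shape [3] with axis = 1 give pre = 2, n = 3, + // post = 4, so len = 24 and dy is reduced from 24 back to 3 values.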
+ dev_ctx.Wait(); + xpu_free(dy_data); + } + + if ((dx == nullptr || dy == nullptr) && !(dy != nullptr && len != n)) { + dev_ctx.Wait(); + } + + if (dx == nullptr) { + xpu_free(dx_data); + } + if (dy == nullptr) { + xpu_free(dy_data); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + elementwise_add, + ops::ElementwiseAddXPUKernel); +REGISTER_OP_XPU_KERNEL(elementwise_add_grad, + ops::ElementwiseAddGradXPUKernel< + paddle::platform::XPUDeviceContext, float>); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_xpu.h b/paddle/fluid/operators/elementwise/elementwise_xpu.h new file mode 100644 index 0000000000000000000000000000000000000000..53c4332e9190de131b48d68c30b84b035ec66e67 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_xpu.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +template +struct XPUAddFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_add(ctx, x, y, z, len); + } +}; + +template +struct XPUMulFunctor { + int operator()(xpu::Context* ctx, const T* x, const T* y, T* z, int len) { + return xpu::elementwise_mul(ctx, x, y, z, len); + } +}; + +template +void XPUElementwise(const framework::ExecutionContext& ctx) { + PADDLE_ENFORCE(platform::is_xpu_place(ctx.GetPlace()), + "This kernel only runs on XPU device."); + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE_NE(x_var, nullptr, + platform::errors::Fatal("Cannot get input Variable X")); + PADDLE_ENFORCE(x_var->IsType(), + "XPU only supports LoDTensor"); + + auto x = x_var->Get(); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + auto x_dims = x.dims(); + auto y_dims_untrimed = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims_untrimed.size(), + "Rank of the first input must be >= rank of the second input."); + axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims.size())"); + auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); + axis = (y_dims.size() == 0) ? x_dims.size() : axis; + int pre, n, post, is_common_broadcast; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &is_common_broadcast); + int len = pre * n * post; + + const T* x_data = x.data(); + const T* y_data = y->data(); + T* z_data = z->data(); + T* y_broadcast = nullptr; + + auto& dev_ctx = + ctx.template device_context(); + + if (post == 1) { + if (std::is_same>::value) { + int res = xpu::matrix_vector_add(dev_ctx.x_context(), x_data, y_data, + z_data, pre, n); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! 
res = %d", + res); + return; + } + if (std::is_same>::value) { + int res = xpu::matrix_vector_mul(dev_ctx.x_context(), x_data, y_data, + z_data, pre, n); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", + res); + return; + } + } + + if (pre != 1 || post != 1) { + PADDLE_ENFORCE(xpu_malloc(reinterpret_cast(&y_broadcast), + len * sizeof(T)) == XPU_SUCCESS); + int res = xpu::broadcast_ew(dev_ctx.x_context(), y_data, y_broadcast, pre, + n, post, xpu::ElementwiseOp::ASSIGN); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", + res); + y_data = y_broadcast; + } + + Functor functor; + int res = functor(dev_ctx.x_context(), x_data, y_data, z_data, len); + PADDLE_ENFORCE(res == xpu::Error_t::SUCCESS, "XPU kernel error! res = %d", + res); + + if (pre != 1 || post != 1) { + dev_ctx.Wait(); + xpu_free(y_broadcast); + } +} + +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index a5b270c1dfef14bc92697c29bfeafa0fe08211d7..03279a9b2c15b8d918333fd61c07ed636f11d889 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -520,11 +520,11 @@ class InstanceNormDoubleGradKernel // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * // np.sum(dy, // axis=(h,w)) * (x - mean) * - // (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var - // * + // (np.mean(ddx, axis=(h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var * // np.mean(dy, axis=(h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), - // axis=(h,w)))) + // axis=(h,w))) Tensor x_sub_mean_mul_invstd; x_sub_mean_mul_invstd.Resize({sample_size, NxC}); diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 898f27f9afef9ca13a9f24ab1b61a50f745d40f7..d65cdc6c150ec6b9e5e4ed3e469069b3beffc819 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -136,7 +136,6 @@ void BenchAllImpls(const typename KernelTuple::attr_type& attr, Args... args) { } using Tensor = paddle::framework::Tensor; - template void BenchKernelXYZN() { using T = typename KernelTuple::data_type; @@ -320,8 +319,15 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + PADDLE_ENFORCE_LE( + static_cast(upper - lower), n - 1, + paddle::platform::errors::InvalidArgument( + "The range of Sgd (upper - lower) should be equal to or lower " + "than n-1 (Sgd size -1). But upper - lower is %d and n-1 is %d.", + static_cast(upper - lower), (n - 1))); + PADDLE_ENFORCE_GT( + n, 0, paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. 
But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/jit/gen/embseqpool.cc b/paddle/fluid/operators/jit/gen/embseqpool.cc index b4e63d87eac064c7f29855cefc3dbe875ccee28f..c549fec0970cb235bed77105c4297669c163c5e7 100644 --- a/paddle/fluid/operators/jit/gen/embseqpool.cc +++ b/paddle/fluid/operators/jit/gen/embseqpool.cc @@ -132,11 +132,31 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + PADDLE_ENFORCE_GT(attr.table_height, 0, + platform::errors::InvalidArgument( + "The attribute table_height of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.table_height)); + PADDLE_ENFORCE_GT(attr.table_width, 0, + platform::errors::InvalidArgument( + "The attribute table_width of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.table_width)); + PADDLE_ENFORCE_GT(attr.index_height, 0, + platform::errors::InvalidArgument( + "The attribute index_height of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.index_height)); + PADDLE_ENFORCE_GT(attr.index_width, 0, + platform::errors::InvalidArgument( + "The attribute index_width of EmbSeqPool should " + "be larger than 0. But it is %d.", + attr.index_width)); + PADDLE_ENFORCE_GT(attr.out_width, 0, + platform::errors::InvalidArgument( + "The attribute out_width of EmbSeqPool should be " + "larger than 0. But it is %d.", + attr.out_width)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.cc b/paddle/fluid/operators/jit/gen/matmul.cc index 047d0d3e1caa2290111104d5799e67cea7b7eed2..3139b252cadbc37d6ffbe2af023bd5e836f15ab7 100644 --- a/paddle/fluid/operators/jit/gen/matmul.cc +++ b/paddle/fluid/operators/jit/gen/matmul.cc @@ -29,7 +29,11 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - PADDLE_ENFORCE_GT(groups.front(), 0); + PADDLE_ENFORCE_GT( + groups.front(), 0, + platform::errors::InvalidArgument("The number of rest registers should " + "be larger than 0. But it is %d.", + groups.front())); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; @@ -118,9 +122,21 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + PADDLE_ENFORCE_GT( + attr.m, 0, platform::errors::InvalidArgument( + "The attribute m (first matrix's row) of MatMul should " + "be larger than 0. But it is %d.", + attr.m)); + PADDLE_ENFORCE_GT( + attr.n, 0, platform::errors::InvalidArgument( + "The attribute n (first matrix's col) of MatMul should " + "be larger than 0. But it is %d.", + attr.n)); + PADDLE_ENFORCE_GT( + attr.k, 0, platform::errors::InvalidArgument( + "The attribute k (second matrix's col) of MatMul should " + "be larger than 0. 
But it is %d.", + attr.k)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/matmul.h b/paddle/fluid/operators/jit/gen/matmul.h index 4f04f7606d2deb8ba58809f9e07e60ba19182a71..eb7328d7e069cf05a22ec1ecee70f36280e6d231 100644 --- a/paddle/fluid/operators/jit/gen/matmul.h +++ b/paddle/fluid/operators/jit/gen/matmul.h @@ -33,7 +33,10 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + PADDLE_ENFORCE_EQ(m_, 1, platform::errors::Unimplemented( + "Jitcode of matmul only support m==1 (first " + "matrix's row) now. But m is %d.", + m_)); this->genCode(); } diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index ec8e4e9827441bc0a817c6da455cb9e530c8c1bf..d8c7b3cdb7b1f36125b76feab19ab4369d491219 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -70,8 +70,14 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + PADDLE_ENFORCE_GT(attr.w, 0, platform::errors::InvalidArgument( + "The attribute width of SeqPool should " + "be larger than 0. But it is %d.", + attr.w)); + PADDLE_ENFORCE_GT(attr.h, 0, platform::errors::InvalidArgument( + "The attribute height of SeqPool should " + "be larger than 0. But it is %d.", + attr.h)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/seqpool.h b/paddle/fluid/operators/jit/gen/seqpool.h index cb562c4c9a6c6be5c5881cb6273ca7426a6c2a10..d4e7b2e29ce22705c0ef7320495f55483d9bfef1 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.h +++ b/paddle/fluid/operators/jit/gen/seqpool.h @@ -127,8 +127,13 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ(reg_idx, rest_used_num_regs, - "All heights should use same regs"); + PADDLE_ENFORCE_EQ( + reg_idx, rest_used_num_regs, + platform::errors::InvalidArgument( + "All heights of SeqPool should use the same number of registers." + "It equals to the numbr of rest registers. But use %d registers " + "and the numbr of rest registers is %d.", + reg_idx, rest_used_num_regs)); for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/paddle/fluid/operators/jit/gen/sgd.cc b/paddle/fluid/operators/jit/gen/sgd.cc index 1452d4139b0d7994e5304680d63a63cb28bb606b..7fe93fdb6a51a811a6e60ba5af31d9a91aadd336 100644 --- a/paddle/fluid/operators/jit/gen/sgd.cc +++ b/paddle/fluid/operators/jit/gen/sgd.cc @@ -116,9 +116,24 @@ class SgdCreator : public JitCodeCreator { size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. 
But param_width " + "is %d and grad_width is %d.", + attr.param_width, attr.grad_width)); + PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr.selected_rows_size, attr.grad_height)); + PADDLE_ENFORCE_GE( + attr.selected_rows_size, 0, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or larger than 0. But selected_rows_size is %d.", + attr.selected_rows_size)); return make_unique(attr, CodeSize(attr)); } }; diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc index 66a8d75fd4de5bae3ba37cf7fe7b1645938aa855..4084d68c2a840812358ec13f33d99fbb1f592c9f 100644 --- a/paddle/fluid/operators/jit/gen/vbroadcast.cc +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -76,7 +76,11 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + PADDLE_ENFORCE_GT( + w, 0, + platform::errors::InvalidArgument( + "The width of VBroadcast should be larger than 0. But w is %d.", + w)); return make_unique(w, CodeSize(w)); } }; diff --git a/paddle/fluid/operators/jit/gen_base.cc b/paddle/fluid/operators/jit/gen_base.cc index 4c49eff49e3efc0664a084f9fa2bb897db0c6f1d..2ae71256cddcb172edb24488d559fe788e99ada5 100644 --- a/paddle/fluid/operators/jit/gen_base.cc +++ b/paddle/fluid/operators/jit/gen_base.cc @@ -49,9 +49,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, - "GenBase Alloc %ld error!", size); - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + PADDLE_ENFORCE_EQ( + posix_memalign(&ptr, alignment, size), 0, + platform::errors::InvalidArgument( + "Jitcode generator (GenBase) failed to allocate %ld bytes of memory!", size)); + PADDLE_ENFORCE_NOT_NULL(ptr, platform::errors::InvalidArgument( + "Failed to allocate jitcode generator " + "(GenBase) CPU memory: size = %d.", + size)); return ptr; } diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 2952cdb87146ec01a366abaf332ce1099c425966..c66e8092d5e4221767100c94174210af24a43abc 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -66,7 +66,8 @@ const char* to_string(KernelType kt) { ONE_CASE(kEmbSeqPool); ONE_CASE(kSgd); default: - PADDLE_THROW("Not support type: %d, or forget to add it.", kt); + PADDLE_THROW(platform::errors::Unimplemented( + "JIT kernel does not support type: %d.", kt)); return "NOT JITKernel"; } return nullptr; @@ -79,7 +80,8 @@ const char* to_string(SeqPoolType tp) { ONE_CASE(kAvg); ONE_CASE(kSqrt); default: - PADDLE_THROW("Not support type: %d, or forget to add it.", tp); + PADDLE_THROW(platform::errors::Unimplemented( + "SeqPool JIT kernel does not support type: %d.", tp)); return "NOT PoolType"; } return nullptr; @@ -100,7 +102,8 @@ KernelType to_kerneltype(const std::string& act) { } else if (lower == "tanh" || lower == "vtanh") { return kVTanh; } - PADDLE_THROW("Not support type: %s, or forget to add this case", act); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel does not support type: %s.", act)); return kNone; } 
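All of the JIT changes above follow a single error-reporting pattern: a bare PADDLE_THROW or PADDLE_ENFORCE gains a typed category from platform::errors plus a message that names the checked constraint and the offending value. A minimal before/after sketch of the pattern (the macros and error categories are Paddle's own, as used throughout this patch; the size check on n is illustrative):

// Before: no error category and no context in the message.
PADDLE_ENFORCE_GT(n, 0);
// After: a typed category, and a message stating the expected
// constraint together with the actual value.
PADDLE_ENFORCE_GT(n, 0,
                  platform::errors::InvalidArgument(
                      "The size n should be larger than 0. But n is %d.", n));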
@@ -109,12 +112,19 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + PADDLE_ENFORCE_GT(i, 0, platform::errors::InvalidArgument( + "Each element of groups should be larger than " + "0. However, the element %d does not satisfy it.", + i)); }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); PADDLE_ENFORCE_GE(sum * block, n, + platform::errors::InvalidArgument( + "The packed n (sum * block) should be equal to or " + "larger than n (matmul row size). " + "However, the packed n is %d and n is %d.", + sum * block, n)); const int block_len = sizeof(float) * block; int n_offset = 0; @@ -136,7 +146,8 @@ void pack_weights(const float* src, float* dst, int n, int k) { template typename std::enable_if::value>::type pack_weights( const T* src, T* dst, int n, int k) { - PADDLE_THROW("Only support pack with float type."); + PADDLE_THROW(platform::errors::Unimplemented( + "Only packing weights with float type is supported.")); } } // namespace jit diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index b6dd49b77728c48c1075109934699545bb282420..0791bb5810526cb930fe1869a60913d4239f72a3 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -85,8 +85,10 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, platform::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + PADDLE_ENFORCE_NE( + ref_iter, ref_pool.end(), + platform::errors::PreconditionNotMet( + "Every Refer Kernel of jitcode should have a reference function.")); auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -101,7 +103,9 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + PADDLE_ENFORCE_NOT_NULL(p, platform::errors::InvalidArgument( + "Getting the reference code of the kernel in CPU " + "failed. The Refer kernel should exist.")); return p->GetFunc(); } @@ -132,7 +136,9 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + PADDLE_ENFORCE_NOT_NULL(ref, platform::errors::InvalidArgument( + "Getting all candidate kernels in CPU failed. 
" + "The Refer Kernel can not be empty.")); res.emplace_back(ref); return res; } @@ -147,11 +153,14 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + PADDLE_ENFORCE_NOT_NULL(i, + platform::errors::InvalidArgument( + "Generate jitcode kernel (GenBase) failed.")); res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + PADDLE_ENFORCE_NOT_NULL(i, platform::errors::InvalidArgument( + "Kernel cast (KernelMore) failed.")); res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -173,7 +182,9 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + PADDLE_ENFORCE_GE(funcs.size(), 1UL, + platform::errors::InvalidArgument( + "The candicate jit kernel is at least one in CPU.")); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index f5b7bfff89825bfcd6cbe4b1008628d3e1093f4c..5d63f4848e6165bfb84c1bfe301d20cc24cfc7b0 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -95,7 +95,8 @@ void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT } else if (type == kVIdentity) { return KernelFuncs, CPUPlace>::Cache().At(d); } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s", type)); return nullptr; } diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index ee31c8df2f882d092783cf0408564a39ca6fafa1..5f3c29ad5efb848f1fa12236ffe36a9f654864a3 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -103,11 +103,24 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { template void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + PADDLE_ENFORCE_EQ( + attr->table_width * attr->index_width, attr->out_width, + platform::errors::InvalidArgument( + "The attribute table_width * index_width of EmbSeqPool should " + "be equal to out_width. But table_width * index_width is %d, " + "out_width is %d.", + attr->table_width * attr->index_width, attr->out_width)); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", - idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + PADDLE_ENFORCE_LT( + idx[i], attr->table_height, + platform::errors::InvalidArgument( + "The idx shoud be lower than the attribute table_height of " + "EmbSeqPool. But %dth of idx is %d and table_height is %d.", + i, idx[i], attr->table_height)); + PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. 
But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -168,22 +181,50 @@ void Softmax(const T* x, T* y, int n, int bs, int remain = 1) { template void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr->param_width, attr->grad_width)); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr->selected_rows_size, attr->grad_height)); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width, width); diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index b8d5e2c24071f6a14b070f4120f8c987110ed4f7..42fb7b4f279c225fb38a49d23e9d76ac1854d12d 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -147,7 +147,8 @@ void (*getActFunc(KernelType type))(const T*, T*, int) { // NOLINT } else if (type == kVIdentity) { return VIdentity; } - PADDLE_THROW("Not support type: %s", type); + PADDLE_THROW(platform::errors::Unimplemented( + "Act JIT kernel do not support type: %s.", type)); return nullptr; } @@ -465,12 +466,25 @@ void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) { template void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + PADDLE_ENFORCE_EQ( + attr->table_width * attr->index_width, attr->out_width, + platform::errors::InvalidArgument( + "The attribute table_width * index_width of EmbSeqPool should " + "be equal to out_width. 
But table_width * index_width is %d and " + "out_width is %d.", + attr->table_width * attr->index_width, attr->out_width)); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT(idx[i], attr->table_height, "idx value: %d, i: %d", - idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + PADDLE_ENFORCE_LT( + idx[i], attr->table_height, + platform::errors::InvalidArgument( + "The idx shoud be lower than the attribute table_height of " + "EmbSeqPool. But %dth of idx is %d and table_height is %d.", + i, idx[i], attr->table_height)); + PADDLE_ENFORCE_GE(idx[i], 0, platform::errors::InvalidArgument( + "The idx shoud be equal to or larger than " + "the 0. But %dth of idx is %d.", + i, idx[i])); }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -505,12 +519,31 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, template void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width, + platform::errors::InvalidArgument( + "The attribute param_width of Sgd should be " + "equal to the attribute grad_width. But param_width " + "is %d and grad_width is %d.", + attr->param_width, attr->grad_width)); + PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height, + platform::errors::InvalidArgument( + "The attribute selected_rows_size of Sgd should be " + "equal to or less than the attribute grad_height. " + "But selected_rows_size is %d and grad_height is %d.", + attr->selected_rows_size, attr->grad_height)); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + PADDLE_ENFORCE_LT(h_idx, attr->param_height, + platform::errors::InvalidArgument( + "The rows of Sgd should be " + "less than the attribute. But %dth of rows " + "is %d and grad_width is %d.", + i, h_idx, attr->param_height)); + PADDLE_ENFORCE_GE(h_idx, 0, platform::errors::InvalidArgument( + "The rows of Sgd should be " + "larger than 0. But %dth of rows " + "is %d.", + i, h_idx)); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index eb56f111f0880f1a884e8f7f7ca2edcebfac695a..0cc62720b87943c8d92e56a53705ec9e4b46e047 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -850,8 +850,15 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = [](int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1, + paddle::platform::errors::InvalidArgument( + "The range of Sgd (upper - lower) should be lower " + "than n-1 (Sgd size -1). But the upper - lower is %d " + "and n-1 is %d.", + static_cast(upper - lower), n - 1)); + PADDLE_ENFORCE_GT( + n, 0, paddle::platform::errors::InvalidArgument( + "The Sgd size should be larger than 0. 
But the n is %d.", n)); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index a0464cf70e2dcc44c42fc2ca7440680ef8a53e6e..aeafe22235c0954d16a73ac242ccb9e54a15413b 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -420,6 +420,22 @@ void Blas::GEMV(bool trans_a, int M, int N, }); } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::float16 alpha, + const platform::float16 *A, const platform::float16 *B, + platform::float16 beta, platform::float16 *C) const { + // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -479,6 +495,19 @@ void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::float16 alpha, const platform::float16 **A, + const platform::float16 **B, platform::float16 beta, platform::float16 **C, + int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, A[k], + B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index bb2b6db100b65dae175af1738a9592b1c4212a9a..37155fa184e23a3582226eaef2d2813e79e7c61c 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" +#include #include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_launch_param_config.h" -#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { @@ -100,7 +99,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { Index segment_offset, dim_index_base, actual_height; Index inner_dim_size = h.inner_dim_size; - h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); + h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height); T minmax = pool.initial(); Index first_segment_id = segment_ids[dim_index_base]; @@ -154,7 +153,7 @@ __global__ void SegmentIndexGradKernel(const Index* segment_ids, const T* input, T* in_grad, Helper h) { CUDA_KERNEL_LOOP(stripe_index, h.total_stripe_count) { Index segment_offset, dim_index_base, actual_height; - h.calculate(stripe_index, segment_offset, dim_index_base, actual_height); + h.calculate(stripe_index, &segment_offset, &dim_index_base, &actual_height); for (Index j = 0; j < actual_height; j++) { Index current_segment_id = segment_ids[dim_index_base + j]; @@ -217,11 +216,11 @@ class ArrangeHelper { total_stripe_count = inner_dim_size * input_outer_dim_num_stripe; } - DEVICE inline void calculate(T stripe_index, T& segment_offset, - T& dim_index_base, T& actual_height) { - segment_offset = stripe_index % inner_dim_size; - dim_index_base = stripe_index / inner_dim_size * DimTileSize; - actual_height = min(DimTileSize, input_length_size - dim_index_base); + DEVICE inline void calculate(T stripe_index, T* segment_offset, + T* dim_index_base, T* actual_height) { + *segment_offset = stripe_index % inner_dim_size; + *dim_index_base = stripe_index / inner_dim_size * DimTileSize; + *actual_height = min(DimTileSize, input_length_size - *dim_index_base); } }; diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ff038d7ef1223d7e7ddafe942492b845172d4986 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -0,0 +1,343 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return framework::make_ddim({1, x_dim[0]}); +} + +static framework::Tensor FoldInitDims(const framework::Tensor &input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} +/** + * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the + * original y_dim is returned. + */ +static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return framework::make_ddim({y_dim[0], 1}); +} + +static void ReshapeTensorIntoMatrixSequence( + framework::Tensor *x, const math::MatDescriptor &descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. + */ +static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, + framework::Tensor *y, + framework::Tensor *out, bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixFromVector(x->dims()); + auto y_dim = ColumnMatrixFromVector(y->dims()); + auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); + } + + ReshapeTensorIntoMatrixSequence(x, mat_dim_x); + ReshapeTensorIntoMatrixSequence(y, mat_dim_y); +} + +template +class MatMulXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *y = context.Input("Y"); + auto *out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + auto mat_dim_a = math::CreateMatrixDescriptor( + RowMatrixFromVector(x->dims()), 0, context.Attr("transpose_X")); + auto mat_dim_b = + math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0, + context.Attr("transpose_Y")); + PADDLE_ENFORCE_EQ( + mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument("Shape mistake in matmul_op")); + PADDLE_ENFORCE_EQ( + mat_dim_a.batch_size_, mat_dim_b.batch_size_, + platform::errors::InvalidArgument("Shape mistake in matmul_op")); + T alpha = static_cast(context.Attr("alpha")); + + auto &dev_ctx = context.template device_context(); + float *data_c = out->data(); + if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) { + int r = + xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_, + mat_dim_a.height_, 
mat_dim_b.width_, mat_dim_a.width_, + alpha, x->data(), y->data(), 0.0f, data_c); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } else { + // batch matmul + int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_, + mat_dim_b.trans_, mat_dim_a.batch_size_, + mat_dim_a.height_, mat_dim_b.width_, + mat_dim_a.width_, alpha, x->data(), + y->data(), data_c, nullptr, nullptr); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } + } +}; + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static framework::Tensor XPUFoldHeadAndLastDims( + const DeviceContext &context, const framework::Tensor &input) { + auto in_dims = input.dims(); + if (in_dims.size() != 3) { + return input; + } + + framework::Tensor output; + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + output.mutable_data(context.GetPlace()); + std::vector in_shape_host = {static_cast(in_dims[0]), + static_cast(in_dims[1]), + static_cast(in_dims[2])}; + std::vector axis_host = {1, 0, 2}; + + int r = xpu::transpose(context.x_context(), input.data(), output.data(), + in_shape_host.data(), axis_host.data(), /*ndims=*/3); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + + return output; +} + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// transpose_X | False | True | False | True +// transpose_Y | False | False | True | True +// -----------+----------+----------+----------+----------- +// dX = | dOut Y^T | Y dOut^T | dOut Y | Y^T dOut^T +// dY = | X^T dOut | X dOut | dOut^T X | dOut^T X^T +// +// When X is a vector of size K, we treat it instead as a matrix of shape +// (1, K). Similarly, when Y is a vector of size K, we treat it instead as +// a matrix of shape (K, 1). +// +// When X and Y are both 3-dimensional tensors, then the first dimension +// the batch dimension can be ignored and the exact same formulas apply +// as for two matrices. +// +// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end +// up with formulas like +// +// dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj} +// +// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N +// to X: (P * M) x K, dOut: (P * M) x N. 
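+// +// For the first column of the table (transpose_X = transpose_Y = false) the +// formulas follow from the chain rule in one line, using the same index +// convention as above: +// +// dX_{ik} = \sum_j dOut_{ij} Y_{kj} = (dOut Y^T)_{ik} +// dY_{kj} = \sum_i X_{ik} dOut_{ij} = (X^T dOut)_{kj} +// +// The remaining columns follow by substituting X^T or Y^T for X or Y +// before differentiating.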
+template +class MatMulGradXPUKernel : public framework::OpKernel { + public: + void MatMul(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + const framework::Tensor &b, bool trans_b, + framework::Tensor *out) const { + out->mutable_data(context.GetPlace()); + auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); + auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); + PADDLE_ENFORCE_EQ( + mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument("Shape mistake in matmul_grad_op")); + PADDLE_ENFORCE_EQ( + mat_dim_a.batch_size_, mat_dim_b.batch_size_, + platform::errors::InvalidArgument("Shape mistake in matmul_grad_op")); + T alpha = static_cast(context.Attr("alpha")); + + auto &dev_ctx = context.template device_context(); + float *data_c = out->data(); + if (mat_dim_a.batch_size_ == 0 || mat_dim_a.batch_size_ == 1) { + int r = + xpu::fc_int16(dev_ctx.x_context(), mat_dim_a.trans_, mat_dim_b.trans_, + mat_dim_a.height_, mat_dim_b.width_, mat_dim_a.width_, + alpha, a.data(), b.data(), 0.0f, data_c); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } else { + // batch matmul + int r = xpu::batched_gemm_int16(dev_ctx.x_context(), mat_dim_a.trans_, + mat_dim_b.trans_, mat_dim_a.batch_size_, + mat_dim_a.height_, mat_dim_b.width_, + mat_dim_a.width_, alpha, a.data(), + b.data(), data_c, nullptr, nullptr); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU API return wrong value[%d], please check whether " + "Baidu Kunlun Card is properly installed.", + r)); + } + } + + void CalcInputGrad(const framework::ExecutionContext &context, + const framework::Tensor &a, bool trans_a, + bool is_fold_init_dims_a, const framework::Tensor &b, + bool trans_b, bool is_fold_init_dims_b, + framework::Tensor *out) const { + if (out == nullptr) return; + bool need_combine = (a.dims().size() == 3 || b.dims().size() == 3) && + out->dims().size() == 2; + if (!need_combine) { + MatMul(context, a, trans_a, b, trans_b, out); + } else { + auto &dev_ctx = context.template device_context(); + MatMul( + context, is_fold_init_dims_a + ? FoldInitDims(a) + : XPUFoldHeadAndLastDims(dev_ctx, a), + trans_a, is_fold_init_dims_b + ? 
FoldInitDims(b) + : XPUFoldHeadAndLastDims(dev_ctx, b), + trans_b, out); + } + } + + void Compute(const framework::ExecutionContext &context) const override { + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = + *context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *dy = context.Output(framework::GradVarName("Y")); + bool transpose_x = context.Attr("transpose_X"); + bool transpose_y = context.Attr("transpose_Y"); + + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + CalcInputGrad(context, y, true, true, dout, true, false, dx); + CalcInputGrad(context, dout, true, true, x, true, false, dy); + } else if (transpose_x) { + CalcInputGrad(context, y, false, false, dout, true, false, dx); + CalcInputGrad(context, x, false, false, dout, false, true, dy); + } else if (transpose_y) { + CalcInputGrad(context, dout, false, false, y, false, true, dx); + CalcInputGrad(context, dout, true, true, x, false, true, dy); + } else { + CalcInputGrad(context, dout, false, false, y, true, false, dx); + CalcInputGrad(context, x, true, true, dout, false, true, dy); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + matmul, ops::MatMulXPUKernel); +REGISTER_OP_XPU_KERNEL( + matmul_grad, + ops::MatMulGradXPUKernel); +#endif diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu index 64ec65a23419725c7cc481beadb9383402a426bd..91958513ddb3c9923487e5de86f188bc3a0a6f65 100644 --- a/paddle/fluid/operators/matmul_v2_op.cu +++ b/paddle/fluid/operators/matmul_v2_op.cu @@ -17,10 +17,12 @@ limitations under the License. 
*/ namespace ops = paddle::operators; namespace plf = paddle::platform; -REGISTER_OP_CUDA_KERNEL(matmul_v2, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel); +REGISTER_OP_CUDA_KERNEL( + matmul_v2, ops::MatMulV2Kernel, + ops::MatMulV2Kernel, + ops::MatMulV2Kernel); REGISTER_OP_CUDA_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel); + ops::MatMulV2GradKernel, + ops::MatMulV2GradKernel); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 8cd4fa12be4065b3ece42e7525481f2f04f35bc8..ee485bd1711e21b86cdf65fdb2f5f0793e42beb4 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -163,17 +163,20 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, if (trans_y) { const int M = Y->numel() / N; VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, M, N, 1., y_data, x_data, 0., Out->data()); + blas.GEMV(false, M, N, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { const int M = y_dims[y_ndim - 1]; const int batch_size = Y->numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, N, M, 1., y_data, x_data, 0., Out->data()); + blas.GEMV(true, N, M, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, y_data, - x_data, 0, Out->data(), batch_size, M * N, 0); + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), + y_data, x_data, static_cast(0), Out->data(), + batch_size, M * N, 0); } } return; @@ -205,16 +208,19 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, const int batch_size = X->numel() / (M * N); if (batch_size == 1) { VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, N, M, 1.0f, x_data, y_data, 0.0f, Out->data()); + blas.GEMV(true, N, M, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, 1.0f, x_data, - y_data, 0, Out->data(), batch_size, M * N, 0); + blas.BatchedGEMM(CblasTrans, CblasNoTrans, M, 1, N, static_cast(1), + x_data, y_data, static_cast(0), Out->data(), + batch_size, M * N, 0); } } else { const int M = X->numel() / N; VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, M, N, 1.0f, x_data, y_data, 0.0f, Out->data()); + blas.GEMV(false, M, N, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } return; } @@ -263,37 +269,38 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, if (x_batch_size == 1 && y_batch_size == 1) { VLOG(3) << "MatMul's case 8"; blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, - y_data, 0.0f, Out->data()); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, static_cast(1), + x_data, y_data, static_cast(0), Out->data()); } else if (x_batch_size == 1) { if (M == 1 && trans_y) { VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, y_batch_size * N, K, 1.0f, y_data, x_data, 0.0f, - Out->data()); + blas.GEMV(false, y_batch_size * N, K, static_cast(1), y_data, x_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 10"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, - x_data, y_data, 0, Out->data(), out_batch_size, 0, - K * N); + trans_y ? 
CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, 0, K * N); } } else if (y_batch_size == 1) { if (!trans_x) { VLOG(3) << "MatMul's case 11"; blas.GEMM(CblasNoTrans, trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, N, K, 1.0f, x_data, y_data, 0.0f, - Out->data()); + x_batch_size * M, N, K, static_cast(1), x_data, y_data, + static_cast(0), Out->data()); } else { VLOG(3) << "MatMul's case 12"; blas.BatchedGEMM(CblasTrans, trans_y ? CblasTrans : CblasNoTrans, M, N, K, - 1.0f, x_data, y_data, 0, Out->data(), out_batch_size, - M * K, 0); + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, M * K, 0); } } else if (!is_broadcast_dims) { VLOG(3) << "MatMul's case 13"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, x_data, - y_data, 0, Out->data(), out_batch_size, M * K, K * N); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_data, y_data, static_cast(0), + Out->data(), out_batch_size, M * K, K * N); } else { // in the case, can't use stridedgemm std::vector x_ptr(out_batch_size); @@ -314,9 +321,9 @@ void MatMulFunction(const Tensor* X, const Tensor* Y, } VLOG(3) << "MatMul's case 14"; blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, M, N, K, 1.0f, - x_ptr.data(), y_ptr.data(), 0.0f, out_ptr.data(), - out_batch_size); + trans_y ? CblasTrans : CblasNoTrans, M, N, K, + static_cast(1), x_ptr.data(), y_ptr.data(), + static_cast(0), out_ptr.data(), out_batch_size); } } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index a6cda154e55b972fc653cffc4815f9e0f6e975de..7a4e11091fd3a6d064f3c4d905bb65c61d62d882 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -55,12 +55,12 @@ inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, } } -static mkldnn::memory::data_type GetDstType(bool is_int8, +static mkldnn::memory::data_type GetDstType(bool is_int8, bool is_bfloat16, bool force_fp32_output, std::string fuse_activation, bool fuse_residual_conn, const Tensor* residual_param) { - auto dst_dt = mkldnn::memory::data_type::f32; // uint8_t, int8_t, float + auto dst_dt = mkldnn::memory::data_type::f32; if (is_int8) { dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") ? 
mkldnn::memory::data_type::u8 @@ -72,6 +72,13 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, auto residual_dt = framework::ToMKLDNNDataType(residual_param->type()); if (dst_dt != residual_dt) dst_dt = residual_dt; } + } else { + if (!force_fp32_output && is_bfloat16) { + dst_dt = mkldnn::memory::data_type::bf16; + if (fuse_residual_conn && residual_param) { + dst_dt = framework::ToMKLDNNDataType(residual_param->type()); + } + } } return dst_dt; } @@ -224,12 +231,15 @@ class ConvMKLDNNHandlerT src_tz.size(), chosen_memory_format); } } - - const auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto weights_md = - platform::MKLDNNMemDesc(weights_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + auto data_type = mkldnn::memory::data_type::f32; + if (ctx.Attr("mkldnn_data_type") == "bfloat16" || + std::is_same::value) + data_type = mkldnn::memory::data_type::bf16; + + const auto src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -241,8 +251,8 @@ class ConvMKLDNNHandlerT if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); + auto bias_md = + platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -384,15 +394,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; + bool is_BFLOAT16 = ctx.Attr("mkldnn_data_type") == "bfloat16"; + auto residual_param = ctx.Input("ResidualData"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + std::string fuse_activation = ctx.Attr("fuse_activation"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto dst_dt = + GetDstType(is_INT8, is_BFLOAT16, force_fp32_output, fuse_activation, + fuse_residual_conn, residual_param); if (!is_INT8) { - ComputeFP32(ctx); + if (dst_dt == mkldnn::memory::data_type::f32) { + ComputeFP32(ctx); + } else if (dst_dt == mkldnn::memory::data_type::bf16) { + ComputeFP32(ctx); + } } else { - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); - auto residual_param = ctx.Input("ResidualData"); - auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation, - fuse_residual_conn, residual_param); if (dst_dt == mkldnn::memory::data_type::f32) { ComputeINT8(ctx); } else if (dst_dt == mkldnn::memory::data_type::u8) { @@ -1103,6 +1119,10 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ops::kConvMKLDNNFP32, ops::ConvMKLDNNOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + conv2d, MKLDNN, ::paddle::platform::CPUPlace, BF16, ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kConvMKLDNNINT8, diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 540642c7140e707441ad9c4d71ae9b777863a7bd..70d4c34d9c5c4d28e2705c85f56bc65f90fbb3cf 100644 --- 
a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -110,4 +110,5 @@ class DeQuantOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(dequantize, MKLDNN, ::paddle::platform::CPUPlace, - ops::DeQuantOpKernel, ops::DeQuantOpKernel); + ops::DeQuantOpKernel, ops::DeQuantOpKernel, + ops::DeQuantOpKernel); diff --git a/paddle/fluid/operators/xpu/mul_xpu_op.cc b/paddle/fluid/operators/mul_op_xpu.cc similarity index 100% rename from paddle/fluid/operators/xpu/mul_xpu_op.cc rename to paddle/fluid/operators/mul_op_xpu.cc index 79aae71c3045f938f4b8f0d3e05ce7cf358c41ea..0c8469101abf097629e2768ee35f72a0e0f72bc5 100644 --- a/paddle/fluid/operators/xpu/mul_xpu_op.cc +++ b/paddle/fluid/operators/mul_op_xpu.cc @@ -14,11 +14,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/mul_op.h" #include #include #include #include -#include "paddle/fluid/operators/mul_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 07333f1ae11c3889b543ca6d327e480607a4bcea..02dcb4045f4cdee6840f5caef98d7329e706eaf2 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -40,12 +40,12 @@ using DataLayout = framework::DataLayout; // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * // np.sum(dy, // axis=(n,h,w)) * (x - mean) * -// (np.mean(ddx, axis=(n,h,w)) - ddx) + ddr * (dy * inv_var - +// (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - // inv_var // * // np.mean(dy, axis=(n,h,w)) - // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), -// axis=(n,h,w)))) +// axis=(n,h,w))) template __global__ void DoubleGradComputeDX(const T *x, const T *mean, @@ -138,7 +138,7 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, ? (j / sample_size * C + i) * sample_size + j % sample_size : j * outer_size + i; dx[index] += (dy[index] * var_val - dy_sum_val / inner_size * var_val - - (x[index] - mean_val) * var_val * + (x[index] - mean_val) * var_val * var_val * dy_mul_x_sub_mean_sum_val * var_val / inner_size) * ddscale[i]; } @@ -326,19 +326,57 @@ __global__ void DoubleGradComputeDScaleWithGlobal( } // math: dx = ddscale * dy * inv_var -// math: ddy = scale * ddx * inv_var template -__global__ void DoubleGradComputeDataWithGlobal( - const T *dy, const T *scale, const T *variance, const double epsilon, - const int C, const int sample_size, const int num, T *dx) { +__global__ void DoubleGradComputeDXWithGlobal(const T *dy, const T *ddscale, + const T *variance, + const double epsilon, const int C, + const int sample_size, + const int num, T *dx) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - if (scale != nullptr) { + if (ddscale != nullptr) { for (int i = gid; i < num; i += stride) { const int c = layout == framework::DataLayout::kNCHW ? 
i / sample_size % C : i % C; T inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = dy[i] * scale[c] * inv_var; + dx[i] = dy[i] * ddscale[c] * inv_var; + } + } +} + +// math: ddy = scale * ddx * inv_var + ddbias + +// ddscale * (x - mean) * inv_var +template +__global__ void DoubleGradComputeDDYWithGlobal( + const T *ddx, const T *scale, const T *mean, const T *variance, const T *x, + const T *ddbias, const T *ddscale, const double epsilon, const int C, + const int sample_size, const int num, T *ddy) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + + if (ddx != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + ddy[i] += ddx[i] * scale[c] * inv_var; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + T inv_var = 1.0 / sqrt(variance[c] + epsilon); + ddy[i] += (x[i] - mean[c]) * inv_var * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = gid; i < num; i += stride) { + const int c = + layout == framework::DataLayout::kNCHW ? i / sample_size % C : i % C; + ddy[i] += ddbias[c]; } } } @@ -383,8 +421,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *mean_data, *variance_data; if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); + const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); + mean_data = running_mean_data; variance_data = running_var_data; } else { const T *smean_data = Saved_mean->data(); @@ -398,12 +439,12 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDXWithGlobal< T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDXWithGlobal< T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); @@ -456,15 +497,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDDYWithGlobal< T, DataLayout::kNHWC><<>>( - ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, - ddy_data); + ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, + ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { - DoubleGradComputeDataWithGlobal< + DoubleGradComputeDDYWithGlobal< T, DataLayout::kNCHW><<>>( - ddx_data, scale_data, variance_data, epsilon, C, sample_size, num, - ddy_data); + ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, + ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc index 3bcf17fc7b37f1c29e23ccccc7d4df92e705671e..bce00933420e4cfa750e93d563070cc48051d6cc 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.cc +++ 
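
Note: with use_global_stats, the DoubleGradComputeDDYWithGlobal kernel above reduces to a per-channel affine expression, ddY = scale * ddX * inv_var + ddbias + ddscale * (x - mean) * inv_var, with inv_var = 1 / sqrt(variance + epsilon). A minimal NumPy sketch for NHWC layout (channel stats broadcast over the last axis; the function name and shapes are mine, not Paddle's):

import numpy as np

def ddy_with_global_stats(ddx, scale, mean, var, x, ddbias, ddscale, eps=1e-5):
    # per-channel inverse standard deviation
    inv_var = 1.0 / np.sqrt(var + eps)
    ddy = np.zeros_like(x)
    if ddx is not None:
        ddy += ddx * scale * inv_var
    if ddscale is not None:
        ddy += (x - mean) * inv_var * ddscale
    if ddbias is not None:
        ddy += ddbias
    return ddy

x = np.random.rand(2, 4, 4, 3)  # NHWC, C == 3
c = np.random.rand(3)
print(ddy_with_global_stats(np.ones_like(x), c, c, c, x, c, c).shape)
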
b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -24,32 +24,45 @@ class DpsgdOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - "Input(Param) of DpsgdOp should not be null."); + platform::errors::NotFound( + "Input(Param) of DpsgdOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - "Input(Grad) of DpsgdOp should not be null."); - PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, - "Input(LearningRate) of DpsgdOp should not be null."); + platform::errors::NotFound( + "Input(Grad) of DpsgdOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of DpsgdOp should not be null.")); PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Param").front(), framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->GetInputsVarType("Param").front())); PADDLE_ENFORCE_EQ( ctx->GetInputsVarType("Grad").front(), framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->GetInputsVarType("Grad").front())); PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, - "Output(ParamOut) of DpsgdOp should not be null."); + platform::errors::NotFound( + "Output(ParamOut) of DpsgdOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 dimension"); + platform::errors::InvalidArgument( + "Learning rate should have 1 dimension. But received " + "LearningRate's dims [%s].", + framework::product(lr_dims))); auto param_dims = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dims, ctx->GetInputDim("Grad"), - "Param and Grad input of DpsgdOp should have same dimension"); + platform::errors::InvalidArgument( + "Param and Grad input of DpsgdOp should have the same dimension. 
But " + "received Para's dim [%s] and Grad's dim [%s].", + param_dims, ctx->GetInputDim("Grad"))); ctx->SetOutputDim("ParamOut", param_dims); } diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h index 4eb52feb85108637aa4878d3555bc3cbff674420..e52a1dd9db1791e0be82eb1ee47999d2b8f51175 100644 --- a/paddle/fluid/operators/optimizers/dpsgd_op.h +++ b/paddle/fluid/operators/optimizers/dpsgd_op.h @@ -28,17 +28,19 @@ class DpsgdOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { const auto *param_var = ctx.InputVar("Param"); PADDLE_ENFORCE_EQ(param_var->IsType(), true, - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); const auto *grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type())); + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); const auto *learning_rate = ctx.Input("LearningRate"); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 10b72524efd4a8f9174eab4f45e6173dc56f2c27..083bd91abfc47a4712563c739b333f7417ce21a0 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -40,43 +40,62 @@ class MomentumOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(param) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(grad) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Velocity"), - "Input(velocity) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of Momentum should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), - "Output(VelocityOut) of Momentum should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::NotFound( + "Input(param) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, + platform::errors::NotFound( + "Input(grad) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Velocity"), true, + platform::errors::NotFound( + "Input(velocity) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, 
but the received is %s", + ctx->GetInputsVarType("Param").front())); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(ParamOut) of Momentum should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("VelocityOut"), true, + platform::errors::NotFound( + "Output(VelocityOut) of Momentum should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::InvalidArgument( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning_rate should be a scalar"); + platform::errors::InvalidArgument( + "Learning_rate should be a scalar. But Received " + "LearningRate's dim [%s]", + framework::product(lr_dims))); auto param_dim = ctx->GetInputDim("Param"); if (ctx->GetInputsVarType("Grad")[0] == framework::proto::VarType::LOD_TENSOR) { PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), - "Param and Grad input of MomentumOp should have the same dimension."); + platform::errors::InvalidArgument( + "Param and Grad input of MomentumOp should have the same " + "dimension. But received Param's dim [%s] and Grad's dim [%s].", + param_dim, ctx->GetInputDim("Grad"))); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Velocity"), - "Param and Velocity of MomentumOp should have the same dimension."); + platform::errors::InvalidArgument( + "Param and Velocity of MomentumOp should have the same " + "dimension. But received Param's dim [%s] and Velocity [%s].", + param_dim, ctx->GetInputDim("Velocity"))); } ctx->SetOutputDim("ParamOut", param_dim); @@ -398,10 +417,12 @@ class MomentumOpKernel : public framework::OpKernel { for_range(functor); } } else { - PADDLE_THROW( - string::Sprintf("MomentumOp only supports LoDTensor or SelectedRows " - "gradient, but the received Variable Type is %s", - framework::ToTypeName(grad_var->Type()))); + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in MomentumOp. 
Expected LoDTensor " + "or SelectedRows, but received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.cc b/paddle/fluid/operators/optimizers/rmsprop_op.cc index eeee008cdc53c457146074060d526d8d0e8b43aa..9e7960c237fdec4d61ab8cf2663558b7b596d087 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op.cc @@ -22,47 +22,75 @@ class RmspropOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("MeanSquare"), - "Input(MeanSquare) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Moment"), - "Input(Moment) of RmspropOp should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(param_out) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("MomentOut"), - "Output(MomentOut) of RmspropOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"), - "Output(MeanSquareOut) of RmspropOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::NotFound( + "Input(Param) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("MeanSquare"), true, + platform::errors::NotFound( + "Input(MeanSquare) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, + platform::errors::NotFound( + "Input(Grad) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Moment"), true, + platform::errors::NotFound( + "Input(Moment) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type in RmspropOp should be " + "LoDTensor, but the received is %s", + ctx->GetInputsVarType("Param").front())); + + PADDLE_ENFORCE_EQ( + ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(param_out) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MomentOut"), true, + platform::errors::NotFound( + "Output(MomentOut) of RmspropOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MeanSquareOut"), true, + platform::errors::NotFound( + "Output(MeanSquareOut) of RmspropOp should not be null.")); if (ctx->Attrs().Get("centered")) { - PADDLE_ENFORCE(ctx->HasOutput("MeanGradOut"), - "Output(MeanGradOut) of RmspropOp should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MeanGradOut"), true, + platform::errors::NotFound( + "Output(MeanGradOut) of RmspropOp should not be null.")); } auto param_dim = ctx->GetInputDim("Param"); PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), - "Param and grad input of RmspropOp should have the
same dimension."); + platform::errors::InvalidArgument( + "Param and grad input of RmspropOp should have the same dimension. " + "But received Param's dim [%s] and Grad's dim [%s].", + param_dim, ctx->GetInputDim("Grad"))); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"), - "Param and Momentum input of RmspropOp " - "should have the same dimension.", + platform::errors::InvalidArgument( + "Param and Moment input of RmspropOp " + "should have the same dimension. But received " + "Param's dim [%s] and Moment's dim [%s]", + param_dim, ctx->GetInputDim("Moment"))); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"), - "Param and Momentum input of RmspropOp " - "should have the same dimension.", + platform::errors::InvalidArgument( + "Param and MeanSquare input of RmspropOp " + "should have the same dimension. But received " + "Param's dim [%s] and MeanSquare's dim [%s]", + param_dim, ctx->GetInputDim("MeanSquare"))); auto lr_dim = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, - "Learning Rate should be a scalar."); + platform::errors::InvalidArgument( + "Learning Rate of RmspropOp should be a scalar. But " + "received LearningRate's dim [%s]", + framework::product(lr_dim))); ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("MomentOut", param_dim); diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 4550052b2d614ccbbb09f4a2b9e747708b2a2baa..1ec712a1431a4657cf1c1456da91fe5369914438 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -148,11 +148,15 @@ class RmspropOpKernel : public framework::OpKernel { auto &mom_tensor = *ctx.Input("Moment"); PADDLE_ENFORCE_EQ(&p_tensor, param_out, - "Param and ParamOut must be the same Tensor"); + platform::errors::InvalidArgument( + "Param and ParamOut must be the same Tensor")); PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, - "Moment and MomentOut must be the same Tensor"); - PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, - "MeanSquare and MeanSquareOut must be the same Tensor"); + platform::errors::InvalidArgument( + "Moment and MomentOut must be the same Tensor")); + PADDLE_ENFORCE_EQ( + &ms_tensor, mean_square_out, + platform::errors::InvalidArgument( + "MeanSquare and MeanSquareOut must be the same Tensor")); auto &dev_ctx = ctx.template device_context(); size_t limit = static_cast(ms_tensor.numel()); @@ -179,8 +183,10 @@ class RmspropOpKernel : public framework::OpKernel { auto &mg_tensor = *ctx.Input("MeanGrad"); auto mg = EigenVector::Flatten(mg_tensor); auto *mean_grad_out = ctx.Output("MeanGradOut"); - PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, - "MeanGrad and MeanGradOut must be the same Tensor"); + PADDLE_ENFORCE_EQ( + &mg_tensor, mean_grad_out, + platform::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); auto mg_out = EigenVector::Flatten(*mean_grad_out); mg_out.device(place) = rho * mg + (1 - rho) * g; @@ -198,8 +204,10 @@ class RmspropOpKernel : public framework::OpKernel { if (centered) { auto &mg_tensor = *ctx.Input("MeanGrad"); auto *mean_grad_out = ctx.Output("MeanGradOut"); - PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, - "MeanGrad and MeanGradOut must be the same Tensor"); + PADDLE_ENFORCE_EQ( + &mg_tensor, mean_grad_out, + platform::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( param_out->mutable_data(ctx.GetPlace()),
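
Note: for reference, the dense update these checks guard is the usual RMSProp recurrence; a hedged NumPy sketch of one step (plain and centered variants, mirroring the Rmsprop/CenteredRmsprop functors as I read them):

import numpy as np

def rmsprop_step(p, g, ms, mom, mg, lr, rho, eps, momentum, centered):
    ms = rho * ms + (1 - rho) * g * g            # MeanSquareOut
    if centered:
        mg = rho * mg + (1 - rho) * g            # MeanGradOut
        denom = np.sqrt(ms - mg * mg + eps)
    else:
        denom = np.sqrt(ms + eps)
    mom = momentum * mom + lr * g / denom        # MomentOut
    return p - mom, ms, mom, mg                  # ParamOut and updated state

p, g = np.ones(4), np.full(4, 0.1)
p, ms, mom, mg = rmsprop_step(p, g, np.zeros(4), np.zeros(4), np.zeros(4),
                              lr=0.01, rho=0.9, eps=1e-6, momentum=0.9,
                              centered=True)
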
mean_square_out->mutable_data(ctx.GetPlace()), @@ -233,8 +241,10 @@ class RmspropOpKernel : public framework::OpKernel { if (centered) { auto &mg_tensor = *ctx.Input("MeanGrad"); auto *mean_grad_out = ctx.Output("MeanGradOut"); - PADDLE_ENFORCE_EQ(&mg_tensor, mean_grad_out, - "MeanGrad and MeanGradOut must be the same Tensor"); + PADDLE_ENFORCE_EQ( + &mg_tensor, mean_grad_out, + platform::errors::InvalidArgument( + "MeanGrad and MeanGradOut must be the same Tensor")); for_range(CenteredRmspropFunctor>( param_out->mutable_data(ctx.GetPlace()), mean_square_out->mutable_data(ctx.GetPlace()), @@ -249,7 +259,12 @@ class RmspropOpKernel : public framework::OpKernel { rho, epsilon, momentum, grad_func)); } } else { - PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in RmspropOp. Expected LoDTensor " + "or SelectedRows, but received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index aeff8da70b958a440953824c46a095a4f86b9379..569dbcd6a3ee105ae8dd8570bbb2215c32343d01 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -22,23 +22,31 @@ class SGDOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(Param) of SGDOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(Grad) of SGDOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of SGDOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of SGDOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, + platform::errors::NotFound( + "Input(Param) of SGDOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Grad"), true, + platform::errors::NotFound("Input(Grad) of SGDOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of SGDOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(ParamOut) of SGDOp should not be null.")); auto lr_dims = ctx->GetInputDim("LearningRate"); PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, - "Maybe the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function."); + platform::errors::NotFound( + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, - "Learning rate should have 1 element"); + platform::errors::InvalidArgument( + "Learning rate should have 1 element. 
But received " + "LearningRate's dims [%s]", + framework::product(lr_dims))); auto param_dim = ctx->GetInputDim("Param"); if (ctx->GetInputsVarType("Grad")[0] == framework::proto::VarType::LOD_TENSOR) { diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index b70f24e0e5e8f2f6c6ac974942ccd4c4c3ad41bb..a5d9ad271f23a4d6a17f07a6c5b6a7d57aa7a3c5 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -57,11 +57,12 @@ class SGDOpKernel public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE(param_var->IsType(), - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type())); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + paddle::framework::ToTypeName(param_var->Type()))); auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); @@ -91,18 +92,30 @@ class SGDOpKernel // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, param_out); + PADDLE_ENFORCE_EQ( + param, param_out, + platform::errors::InvalidArgument( + "The input tensor Param of SgdOp should be equal to ParamOut " + "if variable's type is SelectedRows.")); auto* grad = ctx.Input("Grad"); auto in_height = grad->height(); auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ(in_height, out_dims[0]); + PADDLE_ENFORCE_EQ(in_height, out_dims[0], + platform::errors::InvalidArgument( + "The input tensor Grad's height of SgdOp should be " + "equal to ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + in_height, out_dims[0])); auto& in_value = grad->value(); auto& in_rows = grad->rows(); int64_t in_row_numel = in_value.numel() / in_rows.size(); - PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); + PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height, + platform::errors::InvalidArgument( + "The in_row_numel of SgdOp should be equal to " + "param_out's numel / in_height.")); auto* in_data = in_value.data(); auto* out_data = param_out->data(); @@ -118,7 +131,12 @@ class SGDOpKernel out_data, in_row_numel, in_rows.size()); } else { - PADDLE_THROW("Unsupported Variable Type of Grad"); + PADDLE_ENFORCE_EQ(false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad " + "in SgdOp. Expected LoDTensor or " + "SelectedRows, but received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 539d774a395d739ae98adeb6bd2679d0487d0f06..1aaf95efc3250747a4cb46ab79b4415a6b527907 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -44,8 +44,20 @@ class SGDOpKernel if (grad_var->IsType()) { const auto *grad = ctx.Input("Grad"); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + PADDLE_ENFORCE_EQ(param->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Param's numel of SgdOp " + "should be equal to ParamOut's numel. 
" + "But received Param's " + "numel = [%s], ParamOut's numel = [%s]", + param->numel(), sz)); + PADDLE_ENFORCE_EQ(grad->numel(), sz, + platform::errors::InvalidArgument( + "The input tensor Grad's numel of SgdOp " + "should be equal with ParamOut's numel. " + "But received Grad's " + "numel = [%s], ParamOut's numel = [%s]", + grad->numel(), sz)); jit::sgd_attr_t attr(1, sz, 1, sz, 1); const T *lr = learning_rate->data(); @@ -62,7 +74,11 @@ class SGDOpKernel // TODO(qijun): In Sparse SGD operator, in-place update is enforced. // This manual optimization brings difficulty to track data dependency. // It's better to find a more elegant solution. - PADDLE_ENFORCE_EQ(param, param_out); + PADDLE_ENFORCE_EQ(param, param_out, + platform::errors::InvalidArgument( + "The input tensor Param of SgdOp " + "should be equal with ParamOut if variable's " + "type is SelectedRows. ")); const auto *grad = ctx.Input("Grad"); auto &grad_rows = grad->rows(); @@ -73,7 +89,13 @@ class SGDOpKernel } auto out_dims = param_out->dims(); - PADDLE_ENFORCE_EQ(grad->height(), out_dims[0]); + PADDLE_ENFORCE_EQ( + grad->height(), out_dims[0], + platform::errors::InvalidArgument( + "The input tensor Grad's height of SgdOp " + "should be equal with ParamOut's dims. But received Grad's " + "height [%s] and ParamOut's dims [%s]", + grad->height(), out_dims[0])); auto &grad_value = grad->value(); const T *param_data = param->data(); const T *grad_data = grad_value.data(); @@ -87,19 +109,31 @@ class SGDOpKernel attr.grad_height = grad_rows.size(); // note: it is not grad->height() attr.grad_width = grad_value.numel() / attr.grad_height; attr.selected_rows_size = grad_rows.size(); - PADDLE_ENFORCE_EQ(attr.grad_width, attr.param_width); + PADDLE_ENFORCE_EQ( + attr.grad_width, attr.param_width, + platform::errors::InvalidArgument( + "The grad_value's numel of SgdOp " + "should be equal with param_out's numel. But received " + "grad_value's numel [%s] and param_out's numel [%s]", + attr.grad_width, attr.param_width)); auto sgd = jit::KernelFuncs, platform::CPUPlace>::Cache().At( attr); sgd(lr, param_data, grad_data, rows_data, out_data, &attr); } else { - PADDLE_THROW("Unsupported Variable Type of Grad"); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Grad in SgdOp. Excepted " + "LodTensor or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(grad_var->Type()))); } } else if (param_var->IsType()) { - PADDLE_ENFORCE(grad_var->IsType(), - "when param " - "is SelectedRows, gradient should also be SelectedRows"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "when param is SelectedRows, " + "gradient should also be SelectedRows")); const auto ¶m = param_var->Get(); auto *param_out = ctx.Output("ParamOut"); const auto &grad = grad_var->Get(); @@ -112,27 +146,36 @@ class SGDOpKernel auto param_row_width = param.value().dims()[1]; auto grad_row_width = grad.value().dims()[1]; - VLOG(4) << " param rows: " << param.rows().size() - << " param memory rows: " << param.value().dims()[0] - << " grad rows: " << grad.rows().size() - << " grad memory rows: " << grad.value().dims()[0]; - PADDLE_ENFORCE_EQ(param_row_width, grad_row_width, - "param_row should have the same size with grad_row"); + PADDLE_ENFORCE_EQ( + param_row_width, grad_row_width, + platform::errors::InvalidArgument( + "The param_row in SgdOP should have the same size with grad_row. 
" + "But received param_row's width is [%s], and grad_row's width is " + "[%s]", + param_row_width, grad_row_width)); const auto *lr = learning_rate->data(); const auto *grad_data = grad.value().data(); auto *out_data = param_out->mutable_value()->data(); for (size_t i = 0; i < grad.rows().size(); i++) { int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false); - PADDLE_ENFORCE_GE(id_index, static_cast(0), - "id should be in the table"); + PADDLE_ENFORCE_GE( + id_index, static_cast(0), + platform::errors::InvalidArgument( + "The id in SgdOp should be >= 0. But recevied id_index is [%s]", + id_index)); for (int64_t j = 0; j < grad_row_width; j++) { out_data[id_index * grad_row_width + j] -= lr[0] * grad_data[i * grad_row_width + j]; } } } else { - PADDLE_THROW("Unsupported Variable Type of Parameter"); + PADDLE_ENFORCE_EQ( + false, true, + platform::errors::PermissionDenied( + "Unsupported Variable Type of Parameter in SgdOp. Excepted " + "LodTensor or SelectedRows, But received [%s]", + paddle::framework::ToTypeName(param_var->Type()))); } } }; diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index aec995304a77118ecbf788ca3984c7e9da531f18..05d077b173a13e457fd38187b832f9586926a2ee 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -41,7 +41,9 @@ inline std::vector GetDataFromTensor(const framework::Tensor* x) { // NOTE: Converting int64 to int32 may cause data overflow. vec_new_data = std::vector(data, data + x->numel()); } else { - PADDLE_THROW("The dtype of Tensor must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The dtype of Tensor must be int32 or int64, but received: %s", + x->type())); } return vec_new_data; } @@ -53,10 +55,11 @@ inline std::vector GetDataFromTensorList( for (size_t i = 0; i < list_tensor.size(); ++i) { auto tensor = list_tensor[i]; PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), - "ShapeError: The shape of Tensor in list must be [1]. " - "But received the shape " - "is [%s]", - tensor->dims()); + platform::errors::InvalidArgument( + "The shape of Tensor in list must be [1]. 
" + "But received its shape " + "is [%s]", + tensor->dims())); if (tensor->type() == framework::proto::VarType::INT32) { if (platform::is_gpu_place(tensor->place())) { @@ -76,7 +79,10 @@ inline std::vector GetDataFromTensorList( vec_new_data.push_back(static_cast(*tensor->data())); } } else { - PADDLE_THROW("The dtype of Tensor in list must be int32 or int64."); + PADDLE_THROW(platform::errors::InvalidArgument( + "The dtype of Tensor in list must be int32 or int64, but received: " + "%s", + tensor->type())); } } return vec_new_data; diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index a5dd19d4363d6a8fa99cf48ef2969186de605127..4d9673e9646dedbd001eabcd70d4d34aecaa10b5 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -137,12 +137,12 @@ USE_CUDA_ATOMIC(Max, unsigned int); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT #else -CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { +CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT if (*address >= val) { return; } - unsigned long long int old = *address, assumed; + unsigned long long int old = *address, assumed; // NOLINT do { assumed = old; @@ -169,7 +169,7 @@ CUDA_ATOMIC_WRAPPER(Max, float) { return; } - int *const address_as_i = (int *)address; + int *const address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { @@ -187,9 +187,9 @@ CUDA_ATOMIC_WRAPPER(Max, double) { return; } - unsigned long long int *const address_as_ull = - (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; @@ -209,12 +209,12 @@ USE_CUDA_ATOMIC(Min, unsigned int); #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT #else -CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { +CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT if (*address <= val) { return; } - unsigned long long int old = *address, assumed; + unsigned long long int old = *address, assumed; // NOLINT do { assumed = old; @@ -241,7 +241,7 @@ CUDA_ATOMIC_WRAPPER(Min, float) { return; } - int *const address_as_i = (int *)address; + int *const address_as_i = reinterpret_cast(address); int old = *address_as_i, assumed; do { @@ -259,9 +259,9 @@ CUDA_ATOMIC_WRAPPER(Min, double) { return; } - unsigned long long int *const address_as_ull = - (unsigned long long int *)address; - unsigned long long int old = *address_as_ull, assumed; + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index f14fbdd74f95bfbed53ff787af861ce4656159c0..f1832206a1abbbfccb5e79b39b1c67307aca6769 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/xpu_info.h" TEST(InitDevices, CPU) { using paddle::framework::InitDevices; diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h index d8c5f85f9cfe4b9d6ac07069fff89d37c695af5b..95e4979951d7689b233166170060916033311011 100644 --- a/paddle/fluid/platform/xpu_header.h +++ b/paddle/fluid/platform/xpu_header.h @@ -15,9 +15,36 @@ #pragma once #ifdef PADDLE_WITH_XPU +#include +#include + +#include "paddle/fluid/platform/errors.h" #include "xpu/api.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" namespace xpu = baidu::xpu::api; + +class XPUActHelper { + public: + // Convert string to activation type in xpu + static xpu::Activation_t ConvertToXpuActType( + const std::string& act_type_str) { + static std::unordered_map str2act = { + {"linear", xpu::Activation_t::LINEAR}, + {"relu", xpu::Activation_t::RELU}, + {"sigmoid", xpu::Activation_t::SIGMOID}, + {"tanh", xpu::Activation_t::TANH}, + {"gelu", xpu::Activation_t::GELU}, + {"leaky_relu", xpu::Activation_t::LEAKY_RELU}, + {"sqrt", xpu::Activation_t::SQRT}, + {"square", xpu::Activation_t::SQUARE}}; + + auto res = str2act.find(act_type_str); + PADDLE_ENFORCE_NE(res, str2act.end(), + paddle::platform::errors::InvalidArgument( + "Invalid activation type(%s) in XPU", act_type_str)); + return res->second; + } +}; #endif diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 5ee15073267b6eac8978022a70ead5d0f439c62f..142ab2bb9d790175a843d1b81b74dc762a3213fd 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -41,6 +41,7 @@ namespace detail { // import numpy as np // print np.dtype(np.float16).num # 23 constexpr int NPY_FLOAT16_ = 23; +constexpr int NPY_UINT16_ = 4; // Note: Since float16 is not a builtin type in C++, we register // paddle::platform::float16 as numpy.float16. @@ -60,6 +61,23 @@ struct npy_format_descriptor { static PYBIND11_DESCR name() { return _("float16"); } }; +// Note: Since bfloat16 is not a builtin type in C++ and in numpy, +// we register paddle::platform::bfloat16 as numpy.uint16. +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_UINT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "H" represents UINT16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. + return "H"; + } + static PYBIND11_DESCR name() { return _("bfloat16"); } +}; + } // namespace detail } // namespace pybind11 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ac6531a2cc55dbe47049797b96349d659adab496..69303013d2a41a049276c0d1b03b9d902b555d23 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -988,11 +988,6 @@ set +x fi read testcase <<< $(echo "$line"|grep -oEi "\w+$") - if python $PADDLE_ROOT/tools/is_ut_disabled.py $testcase; then - echo $testcase" is disabled." - continue - fi - if [[ "$is_nightly" != "" ]] && [ ${NIGHTLY_MODE:-OFF} == "OFF" ]; then echo $testcase" will only run at night." 
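
Note: XPUActHelper::ConvertToXpuActType above is a plain string-to-enum lookup with a hard failure on unknown names. The same shape in Python (the numeric values here are placeholders, not the real xpu::Activation_t constants):

def convert_to_xpu_act_type(act_type_str):
    str2act = {"linear": 0, "relu": 1, "sigmoid": 2, "tanh": 3,
               "gelu": 4, "leaky_relu": 5, "sqrt": 6, "square": 7}
    if act_type_str not in str2act:
        raise ValueError("Invalid activation type(%s) in XPU" % act_type_str)
    return str2act[act_type_str]

assert convert_to_xpu_act_type("relu") == 1
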
continue diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index e749cf88b6a49846b678c1c4258d2b3c2a8c01a4..e707de8e068640e28d3a06d539e33f767d7ab2b3 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -230,7 +230,6 @@ from .framework import CPUPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS -from .framework import to_variable #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS @@ -258,6 +257,8 @@ from .tensor.stat import numel #DEFINE_ALIAS from .device import get_cudnn_version from .device import set_device from .device import get_device +from .device import is_compiled_with_xpu +from .device import XPUPlace # from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensorArray #DEFINE_ALIAS diff --git a/python/paddle/device.py b/python/paddle/device.py index de24fd875130e84d6532d033761f68a5c77a68c2..46d0ff7bedcecfefc7d054b0ccbcbf100c2fa0f6 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -22,7 +22,9 @@ from paddle.fluid.dygraph.parallel import ParallelEnv __all__ = [ 'get_cudnn_version', 'set_device', - 'get_device' + 'get_device', + 'XPUPlace', + 'is_compiled_with_xpu' # 'cpu_places', # 'CPUPlace', # 'cuda_pinned_places', @@ -35,6 +37,37 @@ __all__ = [ _cudnn_version = None +def is_compiled_with_xpu(): + """ + Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun + + Returns (bool): whether paddle was built with WITH_XPU=ON + + Examples: + .. code-block:: python + + import paddle + support_xpu = paddle.device.is_compiled_with_xpu() + """ + return core.is_compiled_with_xpu() + + +def XPUPlace(dev_id): + """ + Return a Baidu Kunlun Place + + Parameters: + dev_id(int): Baidu Kunlun device id + + Examples: + .. code-block:: python + + import paddle + place = paddle.device.XPUPlace(0) + """ + return core.XPUPlace(dev_id) + + def get_cudnn_version(): """ This function returns the version of cudnn. 
The return value is an int which represents the diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index f66f013e4dbaadd534d6859b7ba6530779c82a3b..36da7264efe2e489aadbffde56b4260418f91fb2 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -495,7 +495,7 @@ class RoleMakerBase(object): Returns: string: all heter_trainers'endpoints """ - assert self._heter_trainer_endpoints != [] + assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized" return self._heter_trainer_endpoints def _get_heter_worker_endpoint(self): @@ -505,10 +505,10 @@ class RoleMakerBase(object): e.g: if we have 4 cpu-trainer(default), 2 gpu-trainer(heter) then No.0 and No.2 cpu-trainer will work with No.0 gpu-trainer - and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainerr + and No.1 and No.3 cpu-trainer will work with No.1 gpu-trainer """ - assert self._heter_trainer_endpoints != [] - return self._heter_trainer_endpoints[(self._current_id + 1) % + assert self._heter_trainer_endpoints != [], "Heter Worker Endpoints Not initialized" + return self._heter_trainer_endpoints[(self._current_id) % self._heter_worker_num()] def _get_heter_worker_device(self): diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 6dd4661f00062f55bb834bbee50daf1924a0c87a..42be7e869d9a7c6394152167ac2cbce9b0986de0 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -23,6 +23,7 @@ from paddle.fluid.executor import Executor from paddle.fluid.parallel_executor import ParallelExecutor from .runtime_base import RuntimeBase +from ..base.private_helper_function import wait_server_ready class ParameterServerRuntime(RuntimeBase): @@ -94,8 +95,8 @@ class ParameterServerRuntime(RuntimeBase): return False if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.READER: + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: return False return var.persistable @@ -161,6 +162,17 @@ class ParameterServerRuntime(RuntimeBase): trainer_config = self.async_strategy.get_trainer_runtime_config() + dist_strategy = self.context["valid_strategy"] + launch_barrier = dist_strategy.a_sync_configs["launch_barrier"] + if launch_barrier: + # for trainer wait server ready + wait_server_ready(self.role_maker._get_pserver_endpoints()) + + # for ps-heter mode, wait heter worker ready + if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker( + ): + wait_server_ready(self.role_maker._get_heter_worker_endpoints()) + lrs = _has_global_step(_get_lr_ops(self.origin_main_program)) if lrs: @@ -312,7 +324,7 @@ class ParameterServerRuntime(RuntimeBase): opts = _get_optimize_ops(self.origin_main_program) for op in opts: if "Param" in op.input_names and \ - "LearningRate" in op.input_names and op.input("Param")[0] == param_name: + "LearningRate" in op.input_names and op.input("Param")[0] == param_name: return op def _save_dense_params(self, executor, dirname, context, main_program): diff --git a/python/paddle/fluid/data.py b/python/paddle/fluid/data.py index dc57e9f71ed3d0de1a374bdf719b32a083198b31..05ea66f54451ba08032bff4e7bc805bbffa15e73
100644 --- a/python/paddle/fluid/data.py +++ b/python/paddle/fluid/data.py @@ -19,10 +19,12 @@ from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import check_dtype, check_type from ..utils import deprecated +from paddle.fluid.framework import static_only __all__ = ['data'] +@static_only @deprecated(since="2.0.0", update_to="paddle.static.data") def data(name, shape, dtype='float32', lod_level=0): """ diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 01c2f0fed496081400d363d9464360c69d924be8..69fb23383e5fc06da46d1791a056b6d8f4da8c52 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -363,7 +363,7 @@ def guard(place=None): with framework.program_guard(train, startup): with framework.unique_name.guard(): with framework._dygraph_guard(tracer): - with framework._dygraph_place_guard(place): + with framework._dygraph_place_guard(expected_place): yield diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index a14c3a81c121758ed90450cd5eb5990f3f7739e1..05269028acc4004b8af000dabd946f9a3d8ce40f 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -3230,14 +3230,11 @@ class Flatten(layers.Layer): .. code-block:: python import paddle - from paddle import to_variable import numpy as np + paddle.disable_static() inp_np = np.ones([5, 2, 3, 4]).astype('float32') - - paddle.disable_static() - - inp_np = to_variable(inp_np) + inp_np = paddle.to_tensor(inp_np) flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2) flatten_res = flatten(inp_np) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2e3f34f41648a9343b4bccd1044bcd3f7b3d8189..3dc30767e5aa42d6a0a9f673e093f40045cbed87 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1355,7 +1355,7 @@ class Executor(object): if not program._fleet_opt is None: if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker": is_heter = 1 - if program._fleet_opt("trainer", "") == "HeterXpuTrainer": + if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer": is_heter = 1 if scope is None: scope = global_scope() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 797b32f5d4768af59fa4e6aceb75e4b6d9029d91..b4cea6761dcd84e047f98929644a1e264976503d 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -217,7 +217,16 @@ def _dygraph_not_support_(func): def _dygraph_only_(func): def __impl__(*args, **kwargs): assert in_dygraph_mode( - ), "We Only support %s in dynamic mode, please call 'paddle.disable_static()' to enter dynamic mode." % func.__name__ + ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__ + return func(*args, **kwargs) + + return __impl__ + + +def _static_only_(func): + def __impl__(*args, **kwargs): + assert not in_dygraph_mode( + ), "We only support '%s()' in static graph mode, please call 'paddle.enable_static()' to enter static graph mode." 
% func.__name__ return func(*args, **kwargs) return __impl__ @@ -260,6 +269,7 @@ def deprecate_stat_dict(func): dygraph_not_support = wrap_decorator(_dygraph_not_support_) dygraph_only = wrap_decorator(_dygraph_only_) +static_only = wrap_decorator(_static_only_) fake_interface_only = wrap_decorator(_fake_interface_only_) @@ -603,7 +613,9 @@ def convert_np_dtype_to_dtype_(np_dtype): elif dtype == np.bool: return core.VarDesc.VarType.BOOL elif dtype == np.uint16: - return core.VarDesc.VarType.INT16 + # since there is still no support for bfloat16 in NumPy, + # uint16 is used for casting bfloat16 + return core.VarDesc.VarType.BF16 elif dtype == np.uint8: return core.VarDesc.VarType.UINT8 elif dtype == np.int8: diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d513d44acfff230eb229e161e689fbc60a73c602..6b98dea42903e1392febd14b739b49cec7bc8c14 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -31,6 +31,7 @@ from ..unique_name import generate as unique_name import logging from ..data_feeder import check_dtype, check_type +from paddle.fluid.framework import static_only __all__ = [ 'data', 'read_file', 'double_buffer', 'py_reader', @@ -38,6 +39,7 @@ __all__ = [ ] +@static_only def data(name, shape, append_batch_size=True, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2f8952a443107e32d3b6c96dc27c51a8aafe67a1..97a3ebc2135a0649fff88e1a1c14d02dfb7850b1 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -335,9 +335,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_sampling_id_op) -list(REMOVE_ITEM TEST_OPS test_paddle_save_load) - - if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) @@ -534,15 +531,15 @@ if(NOT WIN32) endif() if(NOT APPLE AND NOT WIN32) - bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140) - bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140) + bash_test_modules(test_auto_checkpoint START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint_multiple START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_auto_checkpoint_dist_basic START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_hdfs1 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + 
bash_test_modules(test_hdfs2 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") + bash_test_modules(test_hdfs3 START_BASH dist_test.sh TIMEOUT 140 LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") endif() add_subdirectory(sequence) diff --git a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py index 15e98481c26b20de4e9fa493fa022380ba1fcd63..92d84b8b3f381e8607d0168e3891c2399037e456 100644 --- a/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py +++ b/python/paddle/fluid/tests/unittests/ctr_dataset_reader.py @@ -153,7 +153,7 @@ def gen_fake_line(dnn_data_num=7, return line -def prepare_fake_data(file_nums=9, file_lines=1000): +def prepare_fake_data(file_nums=6, file_lines=1000): """ Create fake data with same type as avazu_ctr_data """ diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py index f62ad66e462862f4c3f04bacc58ca7aac583ef1e..fefaecd3b8979b47cf7c0c4f7aa058e9ffcaae42 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_heter_ctr.py @@ -206,13 +206,6 @@ class TestHeterPsCTR2x2(FleetDistHeterRunnerBase): debug=int(os.getenv("Debug", "0"))) pass_time = time.time() - pass_start print("do_dataset_training done. using time {}".format(pass_time)) - if os.getenv("SAVE_MODEL") == "1": - model_dir = tempfile.mkdtemp() - fleet.save_inference_model(exe, model_dir, - [feed.name for feed in self.feeds], - self.avg_cost) - self.check_model_right(model_dir) - shutil.rmtree(model_dir) fleet.stop_worker() print("do_dataset_training stop worker.") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0ac33383fb26b2a35362e8e39e5994d82d6fe497 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -0,0 +1,208 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
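
Editor's note: the new file below tests conv2d with bfloat16 ("bf16") inputs. NumPy has no bf16 dtype, so throughout these tests bf16 values are carried as uint16 holding the upper half of a float32 bit pattern. A self-contained sketch of that encoding and its inverse (the helper names here are illustrative; the test itself uses convert_float_to_uint16 from op_test.py):

    import struct

    def float_to_bf16_bits(f):
        # keep the top 16 bits of the IEEE-754 float32 encoding
        return struct.unpack('<I', struct.pack('<f', f))[0] >> 16

    def bf16_bits_to_float(bits):
        # pad the discarded mantissa bits with zeros to recover a float32
        return struct.unpack('<f', struct.pack('<I', bits << 16))[0]

    bits = float_to_bf16_bits(1.2345)   # e.g. 0x3f9e
    print(bf16_bits_to_float(bits))     # 1.234375 -- bf16 keeps ~3 digits
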
+ +from __future__ import print_function + +import unittest +import numpy as np +import struct + +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 +from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + return out + + +def conv2d_residual_naive(out, residual): + assert out.shape == residual.shape + out = np.add(out, residual) + return out + + +class TestConv2dBf16Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = True + self.weight_type = np.float32 + self.input_type = np.float32 + self.use_mkldnn = True + self.mkldnn_data_type = "bfloat16" + self.force_fp32_output = False + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_fuse_relu() + self.init_fuse_residual() + self.init_data_type() + self.init_force_fp32_output() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + self.input = np.random.random(self.input_size).astype(np.float32) + self.filter = np.random.random(self.filter_size).astype(np.float32) + conv_out, _, _, _, _ = conv2d_forward_naive(self.input, self.filter, + self.groups, conv2d_param) + self.conv_output_float = conv_out + + if self.fuse_residual: + self.input_residual = np.random.random( + self.input_residual_size).astype(np.float32) + self.conv_output_float = conv2d_residual_naive( + self.conv_output_float, self.input_residual) + self.conv_output = convert_float_to_uint16(self.conv_output_float) + self.outputs = {'Output': self.conv_output} + elif self.force_fp32_output: + self.outputs = {'Output': self.conv_output_float.astype(np.float32)} + + if self.input_type is not np.float32: + self.input = convert_float_to_uint16(self.input) + + self.inputs = { + 'Input': self.input.view(self.input_type), + 'Filter': OpTest.np_dtype_to_fluid_dtype( + self.filter.astype(self.weight_type)) + } + + if self.fuse_residual: + self.inputs['ResidualData'] = OpTest.np_dtype_to_fluid_dtype( + convert_float_to_uint16(self.input_residual)) + + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'mkldnn_data_type': self.mkldnn_data_type, + 'force_fp32_output': self.force_fp32_output, + 'fuse_residual_connection': self.fuse_residual + } + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + self.input_size = [1, 1, 5, 5] # NCHW + f_c = self.input_size[1] // self.groups + self.input_residual_size = [1, 2, 3, 3] + self.filter_size = [2, f_c, 3, 3] + + def init_data_type(self): + self.weight_type = np.float32 + self.input_type = np.float32 + + def init_force_fp32_output(self): + self.force_fp32_output = False + + def init_fuse_relu(self): + self.fuse_activation = "relu" + + def init_fuse_residual(self): + self.fuse_residual = True + + +class TestConv2d(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + 
self.input_residual_size = [2, 6, 3, 3] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_data_type(self): + self.input_type = np.uint16 + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + self.input_residual_size = [2, 6, 5, 5] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + self.input_residual_size = [2, 6, 3, 3] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_data_type(self): + self.input_type = np.uint16 + + +class TestWith1x1ForceFP32Output(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_force_fp32_output(self): + self.force_fp32_output = True + + def init_fuse_residual(self): + self.fuse_residual = False + + +class TestWithInput1x1Filter1x1(TestConv2dBf16Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + self.input_residual_size = [2, 6, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 7a494e3c2c3040356641d05772c883e15e4579e3..9731efced69d4b53bbb5b57b4d252d9a7a0c4f5a 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -36,6 +36,7 @@ class TestConv2dInt8Op(TestConv2dOp): self.use_cuda = False self.use_mkldnn = False self.data_format = "NCHW" + self.mkldnn_data_type = "int8" self.weighttype = np.float32 self.use_mkldnn = True self.init_group() @@ -141,7 +142,8 @@ class TestConv2dInt8Op(TestConv2dOp): 'Scale_weights': self.scale_weights, 'Scale_in_eltwise': self.scale_in_eltwise, 'fuse_activation': self.fuse_activation, - 'fuse_residual_connection': self.fuse_residual + 'fuse_residual_connection': self.fuse_residual, + 'mkldnn_data_type': self.mkldnn_data_type } self.outputs = {'Output': output} diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py index 35419462909df1700219fbbe3841e4dbd094e719..70c76f1fb7186fcc983c0378af657d4aae2d2b32 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 class TestDeQuantizeOp(OpTest): @@ -32,6 +32,9 @@ class TestDeQuantizeOp(OpTest): input = (np.random.randint(0, 100, self.input_size) - 50 ).astype(self.data_type) output = (input * (1 / 
self.scale)).astype('float')
+        elif self.data_type == 'uint16':
+            output = np.random.random(self.input_size).astype(np.float32)
+            input = convert_float_to_uint16(output)
         else:
             input = (np.random.randint(0, 100,
                                        self.input_size)).astype(self.data_type)
@@ -70,5 +73,13 @@ class TestDeQuantizeOp2(TestDeQuantizeOp):
         self.data_type = 'uint8'
 
 
+class TestDeQuantizeOpBf16(TestDeQuantizeOp):
+    def set_scale(self):
+        self.scale = 1.0
+
+    def set_data_type(self):
+        self.data_type = 'uint16'
+
+
 if __name__ == '__main__':
     unittest.main()
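
Editor's note: the uint16 branch added above runs the dequantize setup in reverse relative to the int8/uint8 cases: it draws the expected fp32 output first and derives the operator input from it, since a bf16 tensor is most easily fabricated from an fp32 draw. A sketch of that construction (the shape is illustrative; convert_float_to_uint16 comes from the op_test.py hunk right below):

    import numpy as np
    from paddle.fluid.tests.unittests.op_test import convert_float_to_uint16

    output = np.random.random([2, 3]).astype(np.float32)   # expected result
    input = convert_float_to_uint16(output)                # bf16 bits as uint16
    # With scale = 1.0 the dequantize kernel widens bf16 back to float32,
    # matching `output` up to bf16 rounding (~3 significant digits).
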
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index a6a4b9574c50e254def870783adbc0a0dc3c3ed8..96efc36ed0a5022ef7e1eae623a0eec02c4dcfef 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -20,6 +20,7 @@ import warnings
 import numpy as np
 import random
 import six
+import struct
 import time
 import itertools
 import collections
@@ -167,6 +168,18 @@ def skip_check_grad_ci(reason=None):
     return wrapper
 
 
+def copy_bits_from_float_to_uint16(f):
+    return struct.unpack('<I', struct.pack('<f', f))[0] >> 16
+
+
+def convert_float_to_uint16(float_list):
+    new_output = []
+    for x in np.nditer(float_list):
+        new_output.append(np.uint16(copy_bits_from_float_to_uint16(x)))
+
+    return np.reshape(new_output, float_list.shape).view(np.uint16)
+
+
 class OpTest(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -204,6 +217,9 @@ class OpTest(unittest.TestCase):
                 return False
             return True
 
+        def is_xpu_op_test():
+            return hasattr(cls, "use_xpu") and cls.use_xpu == True
+
         def is_mkldnn_op_test():
             return hasattr(cls, "use_mkldnn") and cls.use_mkldnn == True
 
@@ -226,6 +242,7 @@ class OpTest(unittest.TestCase):
             if cls.dtype in [np.float32, np.float64] \
                 and cls.op_type not in op_accuracy_white_list.NO_FP64_CHECK_GRAD_OP_LIST \
                 and not hasattr(cls, 'exist_fp64_check_grad') \
+                and not is_xpu_op_test() \
                 and not is_mkldnn_op_test():
                 raise AssertionError(
                     "This test of %s op needs check_grad with fp64 precision." %
@@ -242,6 +259,11 @@ class OpTest(unittest.TestCase):
         self.call_once = True
         self.dtype = data_type
 
+    def is_bfloat16_op(self):
+        return self.dtype == np.uint16 or (
+            hasattr(self, 'mkldnn_data_type') and
+            getattr(self, 'mkldnn_data_type') == "bfloat16")
+
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def is_np_data(input):
             return isinstance(input, (np.ndarray, np.generic))
@@ -276,8 +298,9 @@
         infer_dtype(inputs, dtype_set)
         dtype_list = [
             np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16),
-            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.int16),
-            np.dtype(np.int8), np.dtype(np.uint8), np.dtype(np.bool)
+            np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16),
+            np.dtype(np.int16), np.dtype(np.int8), np.dtype(np.uint8),
+            np.dtype(np.bool)
         ]
         # check the dtype in dtype_list in order, select the first dtype that in dtype_set
         for dtype in dtype_list:
@@ -317,6 +340,11 @@
                 self.attrs["use_mkldnn"] == True):
             self.__class__.use_mkldnn = True
 
+        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
+                (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
+                self.attrs["use_xpu"] == True):
+            self.__class__.use_xpu = True
+
         op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
         "infer datatype from inputs and outputs for this test case"
         self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs)
@@ -913,6 +941,8 @@ class OpTest(unittest.TestCase):
         need_run_ops = self._get_need_run_ops(op_desc)
 
         res = {}
+        if hasattr(self, 'attrs') and bool(self.attrs.get('use_xpu', False)):
+            return
         for op_desc, father_op_desc in reversed(need_run_ops):
             # The first one is the forward op
             has_infer_inplace = fluid.core.has_infer_inplace(op_desc.type())
@@ -957,6 +987,14 @@
                 self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST:
             atol = 0
 
+        if self.is_bfloat16_op():
+            check_dygraph = False
+            if hasattr(self, 'force_fp32_output') and getattr(
+                    self, 'force_fp32_output'):
+                atol = 1e-2
+            else:
+                atol = 2
+
         if no_check_set is not None:
             if self.op_type not in no_check_set_white_list.no_check_set_white_list:
                 raise AssertionError(
@@ -1176,6 +1214,11 @@
                 self.attrs["use_mkldnn"] == True):
             self.__class__.use_mkldnn = True
 
+        if (hasattr(self, "use_xpu") and self.use_xpu == True) or \
+                (hasattr(self, "attrs") and "use_xpu" in self.attrs and \
+                self.attrs["use_xpu"] == True):
+            self.__class__.use_xpu = True
+
         places = self._get_places()
         for place in places:
             res = self.check_output_with_place(place, atol, no_check_set,
@@ -1286,8 +1329,9 @@
             no_grad_set = set()
         else:
             if (self.op_type not in no_grad_set_white_list.NEED_TO_FIX_OP_LIST
-                ) and (self.op_type not in
-                       no_grad_set_white_list.NOT_CHECK_OP_LIST):
+                ) and (
+                    self.op_type not in no_grad_set_white_list.NOT_CHECK_OP_LIST
+                ) and (not self.is_bfloat16_op()):
                 raise AssertionError("no_grad_set must be None, op_type is " +
                                      self.op_type + " Op.")
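
Editor's note: the bf16 tolerance block above deserves a remark. `atol = 2` looks enormous next to the usual fp32/fp64 tolerances, but it is consistent with bf16's 8-bit mantissa once an output accumulates to a few hundred; `is_bfloat16_op()` also disables the dygraph cross-check. A rough error budget (the output magnitude is an illustrative assumption, not a number taken from the tests):

    # One bf16 value carries ~8 mantissa bits -> relative error ~2**-8.
    rel_err = 2.0**-8                 # ~0.0039

    # Assumed magnitude of an accumulated conv/matmul output:
    out_magnitude = 500.0
    print(out_magnitude * rel_err)    # ~1.95, in line with atol = 2

    # With force_fp32_output the accumulation stays in fp32, so only the
    # bf16-rounded inputs contribute error and atol tightens to 1e-2.
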
diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
index db9e8d2c6bda011bef7c23e7fb51e246137a3906..6c4834b84f91f68f51b65bfc831775966732b36c 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py
@@ -78,15 +78,17 @@ class TestELUDoubleGradCheck(unittest.TestCase):
     @prog_scope()
     def func(self, place):
-        shape = [2, 3, 7, 9]
+        shape = [2, 3, 6, 6]
         eps = 1e-6
         alpha = 1.1
         dtype = np.float64
+        SEED = 0
 
         x = layers.data('x', shape, False, dtype)
         x.persistable = True
         y = layers.elu(x, alpha=alpha)
+        np.random.seed(SEED)
         x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
         gradient_checker.double_grad_check(
             [x], y, x_init=x_arr, place=place, eps=eps)
@@ -147,5 +149,53 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
         self.func(p)
 
 
+class TestAbsDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        # the shape of the input variable should be specified explicitly, not include -1.
+        shape = [2, 3, 7, 9]
+        eps = 1e-6
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.abs(x)
+        x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
+class TestLogDoubleGradCheck(unittest.TestCase):
+    @prog_scope()
+    def func(self, place):
+        shape = [2, 3, 7, 9]
+        eps = 1e-6
+        dtype = np.float64
+
+        x = layers.data('x', shape, False, dtype)
+        x.persistable = True
+        y = layers.log(x)
+
+        x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
+
+        gradient_checker.double_grad_check(
+            [x], y, x_init=x_arr, place=place, eps=eps)
+
+    def test_grad(self):
+        places = [fluid.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(fluid.CUDAPlace(0))
+        for p in places:
+            self.func(p)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index 791f1ee2dfa534437deb903fc60e2904a8b396a1..ad7539e76e41beaf7afa590f8a0af43a4b3c8b10 100755
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -228,7 +228,7 @@ class TestTanhAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.tanh(x)
         out2 = paddle.tanh(x)
         th = paddle.nn.Tanh()
@@ -573,7 +573,7 @@ class TestHardShrinkAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.hardshrink(x)
         hd = paddle.nn.Hardshrink()
         out2 = hd(x)
@@ -639,7 +639,7 @@ class TestHardtanhAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.hardtanh(x)
         m = paddle.nn.Hardtanh()
         out2 = m(x)
@@ -1063,7 +1063,7 @@ class TestLeakyReluAPI(unittest.TestCase):
     def test_dygraph_api(self):
         paddle.disable_static(self.place)
-        x = paddle.to_variable(self.x_np)
+        x = paddle.to_tensor(self.x_np)
         out1 = F.leaky_relu(x)
         m = paddle.nn.LeakyReLU()
         out2 = m(x)
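
Editor's note: most of the test hunks in this patch are a mechanical rename: `paddle.to_variable` is deprecated in favor of `paddle.to_tensor`, which accepts the same numpy input. The migration in miniature, assuming dygraph mode is active:

    import numpy as np
    import paddle

    paddle.disable_static()
    x_np = np.random.random((2, 3)).astype('float32')
    # before: x = paddle.to_variable(x_np)
    x = paddle.to_tensor(x_np)    # same Tensor, current API
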
diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py
index 5a33e11d2862c037639b1643a2e44ff81a757053..6d2ec0eefbb1c5157fdbcb5a2e04e97e918a95c9 100644
--- a/python/paddle/fluid/tests/unittests/test_adamax_api.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py
@@ -25,7 +25,7 @@ class TestAdamaxAPI(unittest.TestCase):
     def test_adamax_api_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.Adamax(
             learning_rate=0.01,
diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py
index cce24b57d2ca50e96e3ae0cf6d8912a8aea79a31..b799508f6b8d57ed59251f95beeafdb720a7299f 100644
--- a/python/paddle/fluid/tests/unittests/test_adamw_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py
@@ -22,7 +22,7 @@ class TestAdamWOp(unittest.TestCase):
     def test_adamw_op_dygraph(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.01,
@@ -37,7 +37,7 @@ class TestAdamWOp(unittest.TestCase):
     def test_adamw_op_coverage(self):
         paddle.disable_static()
         value = np.arange(26).reshape(2, 13).astype("float32")
-        a = paddle.to_variable(value)
+        a = paddle.to_tensor(value)
         linear = paddle.nn.Linear(13, 5)
         adam = paddle.optimizer.AdamW(
             learning_rate=0.0,
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
index e3c70884ebcf116feb4f5b0aa808c71e4b7f8c4e..b8c5bd2949124d622bc61fdae9dc43c34ab717af 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -147,7 +147,7 @@ class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_avg_pool2d(
                 x=x, output_size=[3, 3])
@@ -245,7 +245,7 @@ class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
             out_1 = adaptive_avg_pool(x=x)
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
index a3c9dd91a69ea83b08c3f817403620460333b5e9..bb36aaebf08421d1bf97775bc73fd30288e95eeb 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -162,7 +162,7 @@ class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             out_1 = paddle.nn.functional.adaptive_avg_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -262,7 +262,7 @@ class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
                          if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.disable_static(place=place)
-            x = paddle.to_variable(self.x_np)
+            x = paddle.to_tensor(self.x_np)
 
             adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
                 output_size=[3, 3, 3])
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py index d78788eb1e7c63be485210780db25e1de6fd84b4..dfa6f3226c8ce06e2f22ee6690fbf1df12b649d0 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -147,7 +147,7 @@ class TestAdaptiveMaxPool2dAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) out_1 = paddle.nn.functional.adaptive_max_pool2d( x=x, return_indices=False, output_size=[3, 3]) @@ -240,7 +240,7 @@ class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3]) out_1 = adaptive_max_pool(x=x) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py index a7de0a5c6a7017617124b893313e0f9830cc09f9..1fa703688cdd932f7121ae870fc26f16fda5d815 100755 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py @@ -162,7 +162,7 @@ class TestAdaptiveMaxPool3dAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) out_1 = paddle.nn.functional.adaptive_max_pool3d( x=x, output_size=[3, 3, 3]) @@ -257,7 +257,7 @@ class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase): if core.is_compiled_with_cuda() else [False]): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.disable_static(place=place) - x = paddle.to_variable(self.x_np) + x = paddle.to_tensor(self.x_np) adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d( output_size=[3, 3, 3]) diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 6e66c0c0029accdcdf81ae67dff1a49e3e8867d4..6238d7dd4a1f4574fa1fabf5d531db6d4a64df09 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -244,9 +244,9 @@ class TestAddMMAPI(unittest.TestCase): def test_error1(): data_x_wrong = np.ones((2, 3)).astype(np.float32) - x = paddle.to_variable(data_x_wrong) - y = paddle.to_variable(data_y) - input = paddle.to_variable(data_input) + x = paddle.to_tensor(data_x_wrong) + y = paddle.to_tensor(data_y) + input = paddle.to_tensor(data_input) out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) self.assertRaises(ValueError, test_error1) ''' diff --git a/python/paddle/fluid/tests/unittests/test_arange.py b/python/paddle/fluid/tests/unittests/test_arange.py index 29003d28e441c02e040a8d6cb9888e376521bc72..d62c08b072b10b025e885ada1af0eb97817cab80 100644 --- a/python/paddle/fluid/tests/unittests/test_arange.py +++ b/python/paddle/fluid/tests/unittests/test_arange.py @@ -98,9 +98,9 @@ class TestArangeImperative(unittest.TestCase): x2 = paddle.tensor.arange(5) x3 = paddle.tensor.creation.arange(5) - start = paddle.to_variable(np.array([0], 'float32')) - end = paddle.to_variable(np.array([5], 'float32')) - 
step = paddle.to_variable(np.array([1], 'float32')) + start = paddle.to_tensor(np.array([0], 'float32')) + end = paddle.to_tensor(np.array([5], 'float32')) + step = paddle.to_tensor(np.array([1], 'float32')) x4 = paddle.arange(start, end, step, 'int64') paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index ab08a0aacbf08768ffff43974ee9a7c7dd4a7288..2fcec657c1404d86354e8a43ea8ac81cfc6947ac 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -96,7 +96,7 @@ class TestDygraph(unittest.TestCase): a = np.random.rand(3, 3) a_t = np.transpose(a, [1, 0]) x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_variable(x_data) + x = paddle.to_tensor(x_data) out = paddle.cholesky(x, upper=False) diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index b56d9f6668e8bcbd37443fb88b1f5f4dd40a2511..2946798a82f78fc55adb6ff0bb3c7f9721a6d60f 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -168,9 +168,9 @@ class TestClipAPI(unittest.TestCase): paddle.disable_static(place) data_shape = [1, 9, 9, 4] data = np.random.random(data_shape).astype('float32') - images = paddle.to_variable(data, dtype='float32') - v_min = paddle.to_variable(np.array([0.2], dtype=np.float32)) - v_max = paddle.to_variable(np.array([0.8], dtype=np.float32)) + images = paddle.to_tensor(data, dtype='float32') + v_min = paddle.to_tensor(np.array([0.2], dtype=np.float32)) + v_max = paddle.to_tensor(np.array([0.8], dtype=np.float32)) out_1 = paddle.clip(images, min=0.2, max=0.8) out_2 = paddle.clip(images, min=0.2, max=0.9) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index 5916000fba79fc0da2ef545beac634a3edfe01df..f625e1de4a3e0564037d71e2393f5914415917d9 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -113,6 +113,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True strategy.a_sync_configs = {"k_steps": 100} + strategy.a_sync_configs = {"launch_barrier": False} if training_role == "TRAINER": self.run_trainer(role, strategy) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_sync.py b/python/paddle/fluid/tests/unittests/test_communicator_sync.py index 95b209b14602676a089a667b0a720056bbe1562b..78e2050d3b48edc4a2bd0195a371a6b34afb7b1e 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_sync.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_sync.py @@ -51,6 +51,7 @@ class TestCommunicator(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False + strategy.a_sync_configs = {"launch_barrier": False} optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index b4dbba7eead397c46c37a8df013dabb00177f030..14c10e7aa2022e2963b0dbb973cefb1793be842f 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -285,9 +285,9 @@ class 
TestConcatAPI(unittest.TestCase): in2 = np.array([[11, 12, 13], [14, 15, 16]]) in3 = np.array([[21, 22], [23, 24]]) paddle.disable_static() - x1 = paddle.to_variable(in1) - x2 = paddle.to_variable(in2) - x3 = paddle.to_variable(in3) + x1 = paddle.to_tensor(in1) + x2 = paddle.to_tensor(in2) + x3 = paddle.to_tensor(in3) out1 = fluid.layers.concat(input=[x1, x2, x3], axis=-1) out2 = paddle.concat(x=[x1, x2], axis=0) np_out1 = np.concatenate([in1, in2, in3], axis=-1) diff --git a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py index 1e25613fa63da440f71f23841095f153e61735e9..a8899d9f022c0961e5bce9c25e0b23cd7658f256 100644 --- a/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py +++ b/python/paddle/fluid/tests/unittests/test_cosine_similarity_api.py @@ -75,8 +75,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_x2 = np.random.rand(*shape).astype(np.float32) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) self.assertTrue(np.allclose(y.numpy(), np_out)) @@ -92,8 +92,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_x2 = np.random.rand(*shape).astype(np.float32) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) self.assertTrue(np.allclose(y.numpy(), np_out)) @@ -110,8 +110,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_x2 = np.random.rand(*shape2).astype(np.float32) np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = F.cosine_similarity(tesnor_x1, tesnor_x2, axis=axis, eps=eps) self.assertTrue(np.allclose(y.numpy(), np_out)) @@ -129,8 +129,8 @@ class TestCosineSimilarityAPI(unittest.TestCase): np_out = self._get_numpy_out(np_x1, np_x2, axis=axis, eps=eps) cos_sim_func = nn.CosineSimilarity(axis=axis, eps=eps) - tesnor_x1 = paddle.to_variable(np_x1) - tesnor_x2 = paddle.to_variable(np_x2) + tesnor_x1 = paddle.to_tensor(np_x1) + tesnor_x2 = paddle.to_tensor(np_x2) y = cos_sim_func(tesnor_x1, tesnor_x2) self.assertTrue(np.allclose(y.numpy(), np_out)) diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index ad121fac8cc045e67cf116d2cf9cedd6ac9bef99..818e15bb319b10a1ffa6850874a26f412422b574 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -21,13 +21,12 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard -from paddle import to_variable class TestCumsumOp(unittest.TestCase): def run_cases(self): data_np = np.arange(12).reshape(3, 4) - data = to_variable(data_np) + data = paddle.to_tensor(data_np) y = paddle.cumsum(data) z = np.cumsum(data_np) diff --git a/python/paddle/fluid/tests/unittests/test_data.py b/python/paddle/fluid/tests/unittests/test_data.py index 
8070148f8b36dd7dab7711abaf25994acebc7e6f..98739f6e1631e5ebd5fc8da45647118be8c05f6f 100644
--- a/python/paddle/fluid/tests/unittests/test_data.py
+++ b/python/paddle/fluid/tests/unittests/test_data.py
@@ -99,5 +99,17 @@ class TestApiStaticDataError(unittest.TestCase):
         self.assertRaises(TypeError, test_shape_type)
 
 
+class TestApiErrorWithDynamicMode(unittest.TestCase):
+    def test_error(self):
+        with program_guard(Program(), Program()):
+            paddle.disable_static()
+            self.assertRaises(AssertionError, fluid.data, 'a', [2, 25])
+            self.assertRaises(
+                AssertionError, fluid.layers.data, 'b', shape=[2, 25])
+            self.assertRaises(
+                AssertionError, paddle.static.data, 'c', shape=[2, 25])
+            paddle.enable_static()
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_default_dtype.py b/python/paddle/fluid/tests/unittests/test_default_dtype.py
index 057933fc7a735c2732cd651e83e99ddfa747b8a8..29ca9a93985977936d1c99d1587d615c67524136 100644
--- a/python/paddle/fluid/tests/unittests/test_default_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_default_dtype.py
@@ -20,7 +20,6 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
 import paddle.fluid.core as core
-from paddle import to_variable
 
 
 class TestDefaultType(unittest.TestCase):
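
Editor's note: the new TestApiErrorWithDynamicMode above pins down the user-facing effect of the @static_only decorator introduced earlier in this patch: calling any of the data entry points while dygraph mode is active now fails fast. A minimal reproduction, mirroring the test rather than adding new behavior:

    import paddle
    import paddle.fluid as fluid

    paddle.disable_static()
    try:
        fluid.data('a', [2, 25])    # guarded by @static_only
    except AssertionError as e:
        print(e)    # "We only support 'data()' in static graph mode, ..."
    paddle.enable_static()
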
""" + paddle.enable_static() # Initialization x = fluid.data(name='x', shape=[3, 2, 1], dtype='float32') @@ -80,6 +81,7 @@ class TestDeprecatedDocorator(unittest.TestCase): # captured captured = get_warning_index(fluid.data) + paddle.disable_static() # testting self.assertGreater(expected, captured) diff --git a/python/paddle/fluid/tests/unittests/test_directory_migration.py b/python/paddle/fluid/tests/unittests/test_directory_migration.py index 529fff158c55fc30248b9f5a88c8c615a8b55c79..2f35b45aa670c1d8e2caa7fe2ecf4e06b9884899 100644 --- a/python/paddle/fluid/tests/unittests/test_directory_migration.py +++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py @@ -36,7 +36,7 @@ class TestDirectory(unittest.TestCase): def test_new_directory(self): new_directory = [ 'paddle.enable_static', 'paddle.disable_static', - 'paddle.in_dynamic_mode', 'paddle.to_variable', 'paddle.grad', + 'paddle.in_dynamic_mode', 'paddle.to_tensor', 'paddle.grad', 'paddle.no_grad', 'paddle.save', 'paddle.load', 'paddle.static.save', 'paddle.static.load', 'paddle.distributed.ParallelEnv', diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py index 7f55e956a94aee79dda07762e953e71807899bff..845be6eda6e0d972ed98535fc987f459e4e8d702 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_async.py @@ -52,6 +52,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True + strategy.a_sync_configs = {"launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -92,6 +93,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True + strategy.a_sync_configs = {"launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py index db3f2afb3668bc1831286f8d13b274895e7632fd..668b4ad872f43c65c47f75a8af016a6c6af58513 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_sync.py @@ -44,6 +44,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False + strategy.a_sync_configs = {"launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index c46d1dc5b0f87262aee8efd4722418be433c98ea..195b3f8de0a40011e3da5ddfa6d633daaa90fc1c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -312,9 +312,6 @@ class TestFleetBase(unittest.TestCase): "========================Error tr1_err 
end===========================" ) - self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") - self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") - # close trainer file tr0_pipe.close() tr1_pipe.close() @@ -325,6 +322,8 @@ class TestFleetBase(unittest.TestCase): ps1.terminate() shutil.rmtree(gloo_path) + self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") + self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") return 0, 0 def check_with_place(self, diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py index ba97c5079bde429b0b7145208926b570d04725bc..6c5a1d6e36c2549d5e3549f81f26d5ffcca3a247 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_base.py @@ -81,7 +81,7 @@ class FleetDistHeterRunnerBase(object): def build_strategy(self, args): self.strategy = paddle.distributed.fleet.DistributedStrategy() self.strategy.a_sync = True - + self.strategy.a_sync_configs = {"launch_barrier": True} return self.strategy def build_optimizer(self, avg_cost, strategy): @@ -237,7 +237,10 @@ class TestFleetHeterBase(unittest.TestCase): return heter0_proc, heter1_proc, heter0_pipe, heter1_pipe def _run_cluster(self, model, envs): - env = {'GRAD_CLIP': str(self._grad_clip_mode)} + env = { + 'GRAD_CLIP': str(self._grad_clip_mode), + 'FLAGS_eager_delete_tensor_gb': str(-1) + } python_path = self._python_interp gloo_path = tempfile.mkdtemp() @@ -286,27 +289,6 @@ class TestFleetHeterBase(unittest.TestCase): tr0_ret = tr0.returncode tr1_ret = tr0.returncode - print("tr get returncode: {}".format(tr0_ret)) - if tr0_ret != 0: - print( - "========================Error tr0_err begin===========================" - ) - os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log")) - print( - "========================Error tr0_err end===========================" - ) - - if tr1_ret != 0: - print( - "========================Error tr1_err begin===========================" - ) - os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log")) - print( - "========================Error tr1_err end===========================" - ) - - self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") - self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") # close trainer file tr0_pipe.close() @@ -320,7 +302,8 @@ class TestFleetHeterBase(unittest.TestCase): ps1.terminate() heter0.terminate() heter1.terminate() - + self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") + self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") shutil.rmtree(gloo_path) return 0, 0 diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py index b3e38a421287611c43bb82d93b4df166e23f6484..5f7d7b21d7ff8da8699c2f55adcde954c1c0156d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_heter_ctr.py @@ -23,38 +23,6 @@ import paddle paddle.enable_static() -class TestDistHeterDatasetAsync2x2(TestFleetHeterBase): - def _setup_config(self): - self._mode = "async" - self._reader = "dataset" - - def check_with_place(self, - model_file, - delta=1e-3, - check_error_log=False, - need_envs={}): - required_envs = { - "PATH": os.getenv("PATH", ""), - "PYTHONPATH": os.getenv("PYTHONPATH", ""), - "LD_LIBRARY_PATH": 
os.getenv("LD_LIBRARY_PATH", ""), - "FLAGS_rpc_deadline": "5000", # 5sec to fail fast - "http_proxy": "", - "CPU_NUM": "3" - } - - required_envs.update(need_envs) - - if check_error_log: - required_envs["GLOG_v"] = "3" - required_envs["GLOG_logtostderr"] = "1" - - tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) - - def test_dist_train(self): - self.check_with_place( - "dist_fleet_heter_ctr.py", delta=1e-5, check_error_log=True) - - class TestDistHeterPyreaderAsync2x2(TestFleetHeterBase): def _setup_config(self): self._mode = "async" diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index 642044bb4b1152b0c6d2b5a8a64e22410f9bd151..e0e487eff11e700b6b2c0531cab9e102d302248e 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -195,7 +195,7 @@ class TestFlattenPython(unittest.TestCase): def test_Negative(): paddle.disable_static() - img = paddle.to_variable(x) + img = paddle.to_tensor(x) out = paddle.flatten(img, start_axis=-2, stop_axis=-1) return out.numpy().shape diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 22f16287c33f96a43361b5fe4ed5d0fe3edbb1bc..7378975aa3795bb42e72f5e892bf08a2910e320a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -211,7 +211,7 @@ class TestImperative(unittest.TestCase): paddle.disable_static() self.assertTrue(paddle.in_dynamic_mode()) np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - var_inp = paddle.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out1 = out.numpy() @@ -221,7 +221,7 @@ class TestImperative(unittest.TestCase): self.assertFalse(paddle.in_dynamic_mode()) paddle.disable_static() self.assertTrue(paddle.in_dynamic_mode()) - var_inp = paddle.to_variable(np_inp) + var_inp = paddle.to_tensor(np_inp) mlp = MLP(input_size=2) out = mlp(var_inp) dy_out2 = out.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 59ddb365e539603c1eba06ca8828fc244b6e542d..97f7162e9979c504de0e29206a7b6d03884e3e19 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -54,7 +54,7 @@ class TestSimpleNet(unittest.TestCase): # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0) input_word = np.array([[1, 2], [2, 1]]).astype('int64') - input = paddle.to_variable(input_word) + input = paddle.to_tensor(input_word) simplenet = SimpleNet(20, 32, dtype) adam = SGDOptimizer( diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py index 8a868e751f0567e6387b0e9471f0382c9456bcb6..281dc7caded1f5d685350e88da776a614a3b8175 100644 --- a/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_isfinite_v2_op.py @@ -41,7 +41,7 @@ def run_dygraph(x_np, op_str, use_gpu=True): if use_gpu and fluid.core.is_compiled_with_cuda(): place = paddle.CUDAPlace(0) paddle.disable_static(place) - x = paddle.to_variable(x_np) + x = paddle.to_tensor(x_np) dygraph_result = 
getattr(paddle.tensor, op_str)(x) return dygraph_result diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 7e6ca8076de5186def1229b58bd23df73021430e..99404246185040e35c59ab5cf374e1948870b2bb 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -543,9 +543,9 @@ class TestJitSaveMultiCases(unittest.TestCase): loaded_layer = paddle.jit.load(model_path) loaded_layer.eval() # inference & compare - x = paddle.to_variable(np.random.random((1, 784)).astype('float32')) + x = paddle.to_tensor(np.random.random((1, 784)).astype('float32')) if with_label: - y = paddle.to_variable(np.random.random((1, 1)).astype('int64')) + y = paddle.to_tensor(np.random.random((1, 1)).astype('int64')) pred, _ = layer(x, y) pred = pred.numpy() else: @@ -677,7 +677,7 @@ class TestJitSaveMultiCases(unittest.TestCase): model_path = "test_not_prune_output_spec_name_warning" configs = paddle.SaveLoadConfig() - out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) configs.output_spec = [out] paddle.jit.save(layer, model_path, configs=configs) @@ -709,7 +709,7 @@ class TestJitSaveMultiCases(unittest.TestCase): model_path = "test_prune_to_static_after_train" configs = paddle.SaveLoadConfig() - out = paddle.to_variable(np.random.random((1, 1)).astype('float')) + out = paddle.to_tensor(np.random.random((1, 1)).astype('float')) configs.output_spec = [out] with self.assertRaises(ValueError): paddle.jit.save( @@ -730,7 +730,7 @@ class TestJitSaveLoadEmptyLayer(unittest.TestCase): def test_save_load_empty_layer(self): layer = EmptyLayer() - x = paddle.to_variable(np.random.random((10)).astype('float32')) + x = paddle.to_tensor(np.random.random((10)).astype('float32')) out = layer(x) paddle.jit.save(layer, self.model_path) load_layer = paddle.jit.load(self.model_path) @@ -746,8 +746,8 @@ class TestJitSaveLoadNoParamLayer(unittest.TestCase): def test_save_load_no_param_layer(self): layer = NoParamLayer() - x = paddle.to_variable(np.random.random((5)).astype('float32')) - y = paddle.to_variable(np.random.random((5)).astype('float32')) + x = paddle.to_tensor(np.random.random((5)).astype('float32')) + y = paddle.to_tensor(np.random.random((5)).astype('float32')) out = layer(x, y) paddle.jit.save(layer, self.model_path) load_layer = paddle.jit.load(self.model_path) diff --git a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py index 041fe4e9043d60852fcaab42bc233b63b39609ce..3a3b7071e04dced7fc33c4ac56a9018bb82afbf4 100644 --- a/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py @@ -90,7 +90,7 @@ class TestKLDivLossDygraph(unittest.TestCase): with paddle.fluid.dygraph.guard(): kldiv_criterion = paddle.nn.KLDivLoss(reduction) pred_loss = kldiv_criterion( - paddle.to_variable(x), paddle.to_variable(target)) + paddle.to_tensor(x), paddle.to_tensor(target)) self.assertTrue(np.allclose(pred_loss.numpy(), gt_loss)) def test_kl_loss_batchmean(self): diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py index 6a15fe494779f93c2f36a301594aaccf55283902..3c37397cae1b586f77249136d6abd2e0dce1d8b3 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_loss.py +++ 
b/python/paddle/fluid/tests/unittests/test_l1_loss.py @@ -26,8 +26,8 @@ class TestFunctionalL1Loss(unittest.TestCase): self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32) def run_imperative(self): - input = paddle.to_variable(self.input_np) - label = paddle.to_variable(self.label_np) + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) dy_result = paddle.nn.functional.l1_loss(input, label) expected = np.mean(np.abs(self.input_np - self.label_np)) self.assertTrue(np.allclose(dy_result.numpy(), expected)) @@ -106,8 +106,8 @@ class TestClassL1Loss(unittest.TestCase): self.label_np = np.random.random(size=(10, 10, 5)).astype(np.float32) def run_imperative(self): - input = paddle.to_variable(self.input_np) - label = paddle.to_variable(self.label_np) + input = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) l1_loss = paddle.nn.loss.L1Loss() dy_result = l1_loss(input, label) expected = np.mean(np.abs(self.input_np - self.label_np)) diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index e3d7003ecedb60f9b4f9a542ed08ca88d894d24a..9ac4895f499f812906fe22cc31712459cb43976c 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -96,7 +96,7 @@ class TestNNLogSoftmaxAPI(unittest.TestCase): # test dygrapg api paddle.disable_static() - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) y = logsoftmax(x) self.assertTrue(np.allclose(y.numpy(), ref_out)) paddle.enable_static() @@ -127,7 +127,7 @@ class TestNNFunctionalLogSoftmaxAPI(unittest.TestCase): self.assertTrue(np.allclose(out[0], ref_out)) paddle.disable_static() - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) y = F.log_softmax(x, axis, dtype) self.assertTrue(np.allclose(y.numpy(), ref_out), True) paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_logsumexp.py b/python/paddle/fluid/tests/unittests/test_logsumexp.py index cf9203dffcbaa5da641b3f7cb8925ac9efcbe115..9032293070a9697aada6034d0b6028eeccc310dd 100644 --- a/python/paddle/fluid/tests/unittests/test_logsumexp.py +++ b/python/paddle/fluid/tests/unittests/test_logsumexp.py @@ -111,7 +111,7 @@ class TestLogsumexpAPI(unittest.TestCase): self.assertTrue(np.allclose(res[0], out_ref)) paddle.disable_static(self.place) - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) out = paddle.logsumexp(x, axis, keepdim) self.assertTrue(np.allclose(out.numpy(), out_ref)) paddle.enable_static() @@ -126,7 +126,7 @@ class TestLogsumexpAPI(unittest.TestCase): def test_alias(self): paddle.disable_static(self.place) - x = paddle.to_variable(self.x) + x = paddle.to_tensor(self.x) out1 = paddle.logsumexp(x) out2 = paddle.tensor.logsumexp(x) out3 = paddle.tensor.math.logsumexp(x) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index 3eb822bfed89b80bccc08fe0d96b6c4b2f9b4ec4..2d5f098a7fe86439d8b829a3b11cf23dfe072b77 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle.fluid.core as core import unittest import numpy as np from op_test import OpTest diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 
884139a23d51c95c79439b91d501dc935baeae36..640771df23b726bd0a8a36b168bc5428fd953c45 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -65,15 +65,21 @@ class TestMatMulV2Op(OpTest): self.y_shape = (100, ) self.trans_x = False self.trans_y = False + + def init_kernel_type(self): self.dtype = "float64" def setUp(self): + self.init_kernel_type() self.config() self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - + result = result.astype(self.dtype) self.inputs = { 'X': x, 'Y': y, @@ -98,7 +104,6 @@ class TestMatMuklOp2(TestMatMulV2Op): self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True - self.dtype = "float64" class TestMatMuklOp3(TestMatMulV2Op): @@ -111,7 +116,6 @@ class TestMatMuklOp3(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp4(TestMatMulV2Op): @@ -124,7 +128,6 @@ class TestMatMuklOp4(TestMatMulV2Op): self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp5(TestMatMulV2Op): @@ -133,11 +136,10 @@ class TestMatMuklOp5(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 1, 100, 2) + self.x_shape = (1, 1, 100, 1) self.y_shape = (100, ) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp6(TestMatMulV2Op): @@ -150,7 +152,6 @@ class TestMatMuklOp6(TestMatMulV2Op): self.y_shape = (100, ) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp7(TestMatMulV2Op): @@ -163,7 +164,6 @@ class TestMatMuklOp7(TestMatMulV2Op): self.y_shape = (100, ) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp8(TestMatMulV2Op): @@ -176,7 +176,6 @@ class TestMatMuklOp8(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp9(TestMatMulV2Op): @@ -189,7 +188,6 @@ class TestMatMuklOp9(TestMatMulV2Op): self.y_shape = (2, 1, 2, 100) self.trans_x = False self.trans_y = True - self.dtype = "float64" class TestMatMuklOp10(TestMatMulV2Op): @@ -198,11 +196,10 @@ class TestMatMuklOp10(TestMatMulV2Op): """ def config(self): - self.x_shape = (1, 1, 2, 100) - self.y_shape = (1, 2, 100, 2) + self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp11(TestMatMulV2Op): @@ -215,7 +212,6 @@ class TestMatMuklOp11(TestMatMulV2Op): self.y_shape = (1, 1, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp12(TestMatMulV2Op): @@ -224,11 +220,10 @@ class TestMatMuklOp12(TestMatMulV2Op): """ def config(self): - self.x_shape = (2, 1, 100, 2) - self.y_shape = (1, 1, 100, 2) + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp13(TestMatMulV2Op): @@ -237,11 +232,10 @@ class TestMatMuklOp13(TestMatMulV2Op): """ def config(self): - self.x_shape = (2, 2, 100, 2) - self.y_shape = (2, 2, 100, 2) + self.x_shape = (2, 2, 2, 50) + self.y_shape = (2, 2, 2, 50) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp14(TestMatMulV2Op): @@ -254,7 +248,6 @@ class 
TestMatMuklOp14(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 2) self.trans_x = True self.trans_y = False - self.dtype = "float64" class TestMatMuklOp15(TestMatMulV2Op): @@ -267,7 +260,6 @@ class TestMatMuklOp15(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 1) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp16(TestMatMulV2Op): @@ -277,10 +269,9 @@ class TestMatMuklOp16(TestMatMulV2Op): def config(self): self.x_shape = (100) - self.y_shape = (1, 2, 2, 100, 1) + self.y_shape = (1, 2, 2, 100, 2) self.trans_x = False self.trans_y = False - self.dtype = "float64" class TestMatMuklOp17(TestMatMulV2Op): @@ -293,7 +284,54 @@ class TestMatMuklOp17(TestMatMulV2Op): self.y_shape = (100) self.trans_x = False self.trans_y = False - self.dtype = "float64" + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) +create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) +create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) class TestMatMulV2API(unittest.TestCase): @@ -331,6 +369,17 @@ class TestMatMulV2API(unittest.TestCase): y = paddle.to_tensor(input_y) result = paddle.matmul(x, y) + def test_dygraph_fp16(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_max_op.py b/python/paddle/fluid/tests/unittests/test_max_op.py index c9afc4bec66f2927a674ac15e807fe01f724c64f..4786d790b148177a9c687103260667309d6ec3d8 100644 --- a/python/paddle/fluid/tests/unittests/test_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_max_op.py @@ -80,7 +80,7 @@ class ApiMaxTest(unittest.TestCase): def test_imperative_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.max(x, axis=0) np_z = z.numpy() z_expected = np.array(np.max(np_x, axis=0)) diff 
--git a/python/paddle/fluid/tests/unittests/test_maximum_op.py b/python/paddle/fluid/tests/unittests/test_maximum_op.py index 5645597007a00cac9c75ec1ae90bc00a5bc75f22..54657d7900e3d43a54adf755e9b213727766db7d 100644 --- a/python/paddle/fluid/tests/unittests/test_maximum_op.py +++ b/python/paddle/fluid/tests/unittests/test_maximum_op.py @@ -61,8 +61,8 @@ class ApiMaximumTest(unittest.TestCase): def test_dynamic_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(self.input_x) - y = paddle.to_variable(self.input_y) + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) z = paddle.maximum(x, y) np_z = z.numpy() z_expected = np.array(np.maximum(self.input_x, self.input_y)) @@ -73,8 +73,8 @@ class ApiMaximumTest(unittest.TestCase): np_x = np.random.rand(5, 4, 3, 2).astype("float64") np_y = np.random.rand(4, 3).astype("float64") - x = paddle.to_variable(self.input_x) - y = paddle.to_variable(self.input_y) + x = paddle.to_tensor(self.input_x) + y = paddle.to_tensor(self.input_y) result_1 = paddle.maximum(x, y, axis=1) result_2 = paddle.maximum(x, y, axis=-2) self.assertEqual((result_1.numpy() == result_2.numpy()).all(), True) diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index 29e79b096cf790858e8e07aedc5c6b76881e8f82..f0094e703cd0d2a8113f70ea99dc8e71cef5b86f 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -204,7 +204,7 @@ class TestMeanAPI(unittest.TestCase): paddle.disable_static(self.place) def test_case(x, axis=None, keepdim=False): - x_tensor = paddle.to_variable(x) + x_tensor = paddle.to_tensor(x) out = paddle.mean(x_tensor, axis, keepdim) if isinstance(axis, list): axis = tuple(axis) diff --git a/python/paddle/fluid/tests/unittests/test_min_op.py b/python/paddle/fluid/tests/unittests/test_min_op.py index b9eff05c5ea9fb585421b6f99bf55b3bb95bf9ff..9c15d7216352c20031b92263a0344ca6eac76c89 100644 --- a/python/paddle/fluid/tests/unittests/test_min_op.py +++ b/python/paddle/fluid/tests/unittests/test_min_op.py @@ -80,7 +80,7 @@ class ApiMinTest(unittest.TestCase): def test_imperative_api(self): paddle.disable_static() np_x = np.array([10, 10]).astype('float64') - x = paddle.to_variable(np_x) + x = paddle.to_tensor(np_x) z = paddle.min(x, axis=0) np_z = z.numpy() z_expected = np.array(np.min(np_x, axis=0)) diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 5f223de1954f7b401ac031265cca8c2e661c7392..927383c1223d54753dd59b3a6c40ea0b9594c378 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -18,6 +18,8 @@ import unittest import numpy as np import paddle import paddle.fluid.core as core +import sys +sys.path.append("..") from op_test import OpTest import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -175,57 +177,5 @@ class TestFP16MulOp2(TestMulOp2): no_grad_set=set('Y')) -@unittest.skipIf(not core.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp1(TestMulOp): - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - place = core.XPUPlace(0) - self.check_output_with_place(place, atol=1e-1) - - def test_check_grad_normal(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.5) - - def 
test_check_grad_ingore_x(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - -@unittest.skipIf(not core.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestXPUMulOp2(TestMulOp2): - def init_dtype_type(self): - self.dtype = np.float32 - - def test_check_output(self): - place = core.XPUPlace(0) - self.check_output_with_place(place, atol=2e-1) - - def test_check_grad_normal(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.9) - - def test_check_grad_ingore_x(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - - def test_check_grad_ingore_y(self): - place = core.XPUPlace(0) - self.check_grad_with_place( - place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index a89b9fde7f92de0d493ad87a2f0950548ba8ff98..cb4bd16ce219f8a649716d8efff07eb82d5fffc4 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -130,5 +130,41 @@ class TestBatchNormDoubleGradCheckCase4(TestBatchNormDoubleGradCheck): self.shape = [2, 2, 3, 4, 5] +class TestBatchNormDoubleGradCheckCase5(TestBatchNormDoubleGradCheck): + @prog_scope() + def func(self, place): + prog = fluid.Program() + with fluid.program_guard(prog): + np.random.seed() + dtype = "float32" + eps = 0.005 + atol = 2e-4 + chn = self.shape[1] if self.data_layout == 'NCHW' else self.shape[ + -1] + x = layers.create_parameter(dtype=dtype, shape=self.shape, name='x') + z = fluid.layers.batch_norm( + input=x, + data_layout=self.data_layout, + use_global_stats=self.use_global_stats) + x_arr = np.random.uniform(-1, 1, self.shape).astype(dtype) + w, b = prog.global_block().all_parameters()[1:3] + w_arr = np.ones(chn).astype(dtype) + b_arr = np.zeros(chn).astype(dtype) + gradient_checker.double_grad_check( + [x, w, b], + z, + x_init=[x_arr, w_arr, b_arr], + atol=atol, + place=place, + eps=eps) + + +class TestBatchNormDoubleGradCheckCase6(TestBatchNormDoubleGradCheckCase5): + def init_test(self): + self.data_layout = 'NCHW' + self.use_global_stats = True + self.shape = [2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 74d44d0f8b66739f7883173081f1da1250335b17..fee3494558604fb00f767261c06d4b3612e62ad0 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -29,19 +29,23 @@ IMAGE_SIZE = 784 CLASS_NUM = 10 -# define a random dataset -class RandomDataset(paddle.io.Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): +def random_batch_reader(): + def _get_random_inputs_and_labels(): np.random.seed(SEED) - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + image = np.random.random([BATCH_SIZE, IMAGE_SIZE]).astype('float32') + label = 
np.random.randint(0, CLASS_NUM - 1, ( + BATCH_SIZE, + 1, )).astype('int64') return image, label - def __len__(self): - return self.num_samples + def __reader__(): + for _ in range(BATCH_NUM): + batch_image, batch_label = _get_random_inputs_and_labels() + batch_image = paddle.to_tensor(batch_image) + batch_label = paddle.to_tensor(batch_label) + yield batch_image, batch_label + + return __reader__ class LinearNet(nn.Layer): @@ -66,8 +70,7 @@ def train(layer, loader, loss_fn, opt): class TestSaveLoad(unittest.TestCase): def setUp(self): # enable dygraph mode - self.place = paddle.CPUPlace() - paddle.disable_static(self.place) + paddle.disable_static() # config seed paddle.manual_seed(SEED) @@ -81,14 +84,8 @@ class TestSaveLoad(unittest.TestCase): adam = opt.Adam(learning_rate=0.001, parameters=layer.parameters()) # create data loader - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - loader = paddle.io.DataLoader( - dataset, - places=self.place, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) + # TODO: the new DataLoader causes an unknown timeout on Windows, so replace it with a simple reader for now + loader = random_batch_reader() # train train(layer, loader, loss_fn, adam) @@ -103,8 +100,8 @@ class TestSaveLoad(unittest.TestCase): layer, opt = self.build_and_train_model() # save - layer_save_path = "linear.pdparams" - opt_save_path = "linear.pdopt" + layer_save_path = "test_paddle_save_load.linear.pdparams" + opt_save_path = "test_paddle_save_load.linear.pdopt" layer_state_dict = layer.state_dict() opt_state_dict = opt.state_dict() @@ -120,7 +117,7 @@ class TestSaveLoad(unittest.TestCase): # test save load in static mode paddle.enable_static() - static_save_path = "static_mode_test/linear.pdparams" + static_save_path = "static_mode_test/test_paddle_save_load.linear.pdparams" paddle.save(layer_state_dict, static_save_path) load_static_state_dict = paddle.load(static_save_path) self.check_load_state_dict(layer_state_dict, load_static_state_dict) @@ -133,15 +130,15 @@ class TestSaveLoad(unittest.TestCase): # 2. test save path format error with self.assertRaises(ValueError): - paddle.save(layer_state_dict, "linear.model/") + paddle.save(layer_state_dict, "test_paddle_save_load.linear.model/") # 3. test load path not exist error with self.assertRaises(ValueError): - paddle.load("linear.params") + paddle.load("test_paddle_save_load.linear.params") # 4. 
test load old save path error with self.assertRaises(ValueError): - paddle.load("linear") + paddle.load("test_paddle_save_load.linear") if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index 25216175d59935535a352b02afc3c8f371cedd63..c1169dfc5210ac80a709afa06d3bf9a470a785b0 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -195,6 +195,23 @@ class TestPool1d_API(unittest.TestCase): result = max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_return_index_results(self, place): + with fluid.dygraph.guard(place): + input_np = np.random.random([2, 3, 32]).astype("float32") + input = fluid.dygraph.to_variable(input_np) + result, index = F.max_pool1d( + input, kernel_size=2, stride=2, padding=0, return_indices=True) + + result_np = max_pool1D_forward_naive( + input_np, ksize=[2], strides=[2], paddings=[0]) + + self.assertTrue(np.allclose(result.numpy(), result_np)) + + max_pool1d_dg = paddle.nn.layer.MaxPool1d( + kernel_size=2, stride=None, padding=0) + result = max_pool1d_dg(input) + self.assertTrue(np.allclose(result.numpy(), result_np)) + def check_max_dygraph_padding_same(self, place): with fluid.dygraph.guard(place): input_np = np.random.random([2, 3, 32]).astype("float32") @@ -228,6 +245,7 @@ class TestPool1d_API(unittest.TestCase): self.check_avg_static_results(place) self.check_max_dygraph_padding_same(place) self.check_avg_dygraph_padding_same(place) + self.check_max_dygraph_return_index_results(place) class TestPool2dError_API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_randn_op.py b/python/paddle/fluid/tests/unittests/test_randn_op.py index 9d2c03f3bba914d8f6b06b54ce0e19c168edb9e3..4ddd98a8a73424d3dee5ff640a73f5b49e4453c5 100644 --- a/python/paddle/fluid/tests/unittests/test_randn_op.py +++ b/python/paddle/fluid/tests/unittests/test_randn_op.py @@ -63,7 +63,7 @@ class TestRandnOpForDygraph(unittest.TestCase): dim_2 = paddle.fill_constant([1], "int32", 50) x3 = paddle.randn(shape=[dim_1, dim_2, 784]) - var_shape = paddle.to_variable(np.array(shape)) + var_shape = paddle.to_tensor(np.array(shape)) x4 = paddle.randn(var_shape) for out in [x1, x2, x3, x4]: diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 9abbee173852baf9db998aad3b71edabdb3e11ed..98c7e3800c20c900b9cdcb01eacd7ab20a612780 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -105,8 +105,8 @@ class TestRetainGraph(unittest.TestCase): A = np.random.rand(2, 3, 32, 32).astype('float32') B = np.random.rand(2, 3, 32, 32).astype('float32') - realA = paddle.to_variable(A) - realB = paddle.to_variable(B) + realA = paddle.to_tensor(A) + realB = paddle.to_tensor(B) fakeB = g(realA) optim_d.clear_gradients() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 7c7a71a3be1b508c850048c3945f29ef7424654c..067d1ea5f73bf7d7af8a3511fa18dbc38b148656 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -487,24 +487,24 @@ class TestTransformer(unittest.TestCase): dropout=dropout, weight_attr=[None], bias_attr=[False]) - src = paddle.to_variable( + src 
= paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) @@ -521,24 +521,24 @@ class TestTransformer(unittest.TestCase): dropout=dropout, weight_attr=[None, None], bias_attr=[False, False]) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) @@ -555,24 +555,24 @@ class TestTransformer(unittest.TestCase): dropout=dropout, weight_attr=[None, None, None], bias_attr=[False, False, True]) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) @@ -588,24 +588,24 @@ class TestTransformer(unittest.TestCase): dim_feedforward=dim_feedforward, dropout=dropout, bias_attr=False) - src = paddle.to_variable( + src = paddle.to_tensor( np.random.rand(batch_size, source_length, d_model).astype( "float32")) - tgt = paddle.to_variable( + tgt = paddle.to_tensor( np.random.rand(batch_size, target_length, d_model).astype( "float32")) src_mask = np.zeros((batch_size, n_head, source_length, source_length)).astype("float32") 
src_mask[0][0][0][0] = -np.inf - src_mask = paddle.to_variable(src_mask) + src_mask = paddle.to_tensor(src_mask) tgt_mask = np.zeros((batch_size, n_head, target_length, target_length)).astype("float32") tgt_mask[0][0][0][0] = -1e9 memory_mask = np.zeros((batch_size, n_head, target_length, source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 - tgt_mask, memory_mask = paddle.to_variable( - tgt_mask), paddle.to_variable(memory_mask) + tgt_mask, memory_mask = paddle.to_tensor( + tgt_mask), paddle.to_tensor(memory_mask) trans_output = transformer(src, tgt, src_mask, tgt_mask, memory_mask) diff --git a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py index 21e618a46201659fe0c4e5c67d1d9a8bafd70f1b..2cea3072809ec316abf55844b75c775bcd72042c 100644 --- a/python/paddle/fluid/tests/unittests/test_zeros_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_zeros_like_op.py @@ -63,7 +63,7 @@ class TestZerosLikeImpeartive(unittest.TestCase): place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() paddle.disable_static(place) - x = paddle.to_variable(np.ones(shape)) + x = paddle.to_tensor(np.ones(shape)) for dtype in [np.bool, np.float32, np.float64, np.int32, np.int64]: out = zeros_like(x, dtype) self.assertEqual((out.numpy() == np.zeros(shape, dtype)).all(), diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py new file mode 100755 index 0000000000000000000000000000000000000000..788c110a592c0e18734e2b361a7edcdbc691230a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op.py @@ -0,0 +1,215 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
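+ +# Each case in this file follows the same OpTest recipe: setUp() builds the +# op's inputs and a NumPy reference output, and the shared base class compares +# the XPU kernel against that reference, roughly (an illustrative sketch of +# the skeleton below, not an extra test): +# +# if paddle.is_compiled_with_xpu(): +# place = paddle.XPUPlace(0) # first XPU device +# self.check_output_with_place(place, atol=1e-3) +# +# Most subclasses only swap in a different op_type and reference computation; +# TestXPUSigmoid additionally adds a gradient check.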
+ +from __future__ import print_function + +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle.fluid.core as core +from op_test import OpTest +from scipy.special import expit, erf +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.fluid import compiler, Program, program_guard + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUActivation(OpTest): + def setUp(self): + self.op_type = "exp" + self.init_dtype() + self.init_kernel_type() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.exp(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def init_kernel_type(self): + pass + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSigmoid(TestXPUActivation): + def setUp(self): + self.op_type = "sigmoid" + self.init_dtype() + + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = 1 / (1 + np.exp(-x)) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.01) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUTanh(TestXPUActivation): + def setUp(self): + self.op_type = "tanh" + self.init_dtype() + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.tanh(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSqrt(TestXPUActivation): + def setUp(self): + self.op_type = "sqrt" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.sqrt(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUAbs(TestXPUActivation): + def setUp(self): + self.op_type = "abs" + self.init_dtype() + + x = np.random.uniform(-1, 1, [4, 25]).astype(self.dtype) + # Because we set delta = 0.005 in calculating numeric gradient, + # if x is too small, such as 0.002, x_neg will be -0.003 + # and x_pos will be 0.007, so the numeric gradient is inaccurate. 
+ # We should avoid this. + x[np.abs(x) < 0.005] = 0.02 + out = np.abs(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPURelu(TestXPUActivation): + def setUp(self): + self.op_type = "relu" + self.init_dtype() + + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + # The same reason as in TestXPUAbs + x[np.abs(x) < 0.005] = 0.02 + out = np.maximum(x, 0) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': x} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUGelu(TestXPUActivation): + def setUp(self): + self.op_type = "gelu" + self.init_dtype() + approximate = False + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = gelu(x, approximate) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {"approximate": approximate, 'use_xpu': True} + + +def gelu(x, approximate): + if approximate: + y_ref = 0.5 * x * (1.0 + np.tanh( + np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) + else: + y_ref = 0.5 * x * (1 + erf(x / np.sqrt(2))) + return y_ref.astype(x.dtype) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPULog(TestXPUActivation): + def setUp(self): + self.op_type = "log" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSquare(TestXPUActivation): + def setUp(self): + self.op_type = "square" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.square(x) + + self.attrs = {'use_xpu': True} + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUPow(TestXPUActivation): + def setUp(self): + self.op_type = "pow" + self.init_dtype() + + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.power(x, 3) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'factor': 3.0, 'use_xpu': True} + self.outputs = {'Out': out} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6e7d21c1a1918e1d24f580f1e6787632b41dbc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_elementwise_add_op.py @@ -0,0 +1,346 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
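+ +# The cases in this file cover same-shape addition plus NumPy-style +# broadcasting. When Y has fewer dimensions than X, the 'axis' attribute says +# where Y's shape lines up with X's; for example, with X of shape (2, 100, 3), +# Y of shape (100,) and axis=1 (the values used by +# TestElementwiseAddOp_broadcast_1 below), the expected output is simply: +# +# out = x + y.reshape(1, 100, 1) # broadcast Y along axis 1 of X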
+ +from __future__ import print_function +import sys +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard + + +class TestElementwiseAddOp(OpTest): + def init_kernel_type(self): + self.use_mkldnn = False + + def setUp(self): + self.op_type = "elementwise_add" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def test_check_output(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + self.check_output(check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_normal(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + self.check_grad( + ['X', 'Y'], 'Out', check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_ignore_x(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + self.check_grad( + ['Y'], + 'Out', + no_grad_set=set("X"), + check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad_ignore_y(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if self.dtype == np.float16: + return + self.check_grad( + ['X'], + 'Out', + no_grad_set=set('Y'), + check_dygraph=(self.use_mkldnn == False)) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float64 + + def init_axis(self): + self.axis = -1 + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "elementwise_add" + self.init_dtype() + self.init_input_output() + self.init_axis() + + self.inputs = {'X': self.x, 'Y': self.y} + self.attrs = {'axis': self.axis, 'use_mkldnn': False, 'use_xpu': True} + self.outputs = {'Out': self.out} + + def test_check_output(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['Y'], 'Out') + + def test_check_grad_ignore_y(self): + if self.dtype == np.float32 and paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = -1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_scalar(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 
4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1,1) to test broadcast.") +class TestElementwiseAddOp_scalar2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_Vector(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 100, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 100, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 100) + + +class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3, 4).astype(self.dtype) + self.y = np.random.rand(100, 1).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 12).astype(self.dtype) + self.y = np.random.rand(10, 1, 12).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) + self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype) + self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12) + + def init_axis(self): + self.axis = 1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1) + + def init_axis(self): + self.axis = 1 + + +class 
TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(1, 1, 100).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype) + self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 12).astype(self.dtype) + self.y = np.random.rand(2, 3, 10, 12).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = 2 + + +class TestElementwiseAddOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1) + + # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2) + + +class TestAddOp(unittest.TestCase): + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.add(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_declarative(self): + with fluid.program_guard(fluid.Program()): + + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = paddle.add(x, y) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) + z_expected = np.array([3., 8., 6.]) + self.assertEqual((z_value == z_expected).all(), True) + + def test_dygraph(self): + with fluid.dygraph.guard(): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = fluid.dygraph.to_variable(np_x) + y = fluid.dygraph.to_variable(np_y) + z = paddle.add(x, y) + np_z = z.numpy() + z_expected = np.array([3., 8., 6.]) + self.assertEqual((np_z == z_expected).all(), True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ac32d224910a9254efa9f95135652af6ef8adafe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op.py @@ -0,0 +1,355 @@ +# Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +sys.path.append("..") +import paddle.fluid.core as core +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): + BATCH_SIZE = 2 + M = 3 + N = 4 + K = 5 + if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): + K = 1 + if dim_X == 1: + if transpose_X: + shape_X = [M] + else: + shape_X = [K] + if dim_Y == 1: + if transpose_Y: + shape_Y = [N] + else: + shape_Y = [K] + if dim_X >= 2: + if transpose_X: + shape_X = [K, M] + else: + shape_X = [M, K] + if dim_X == 3: + shape_X = [BATCH_SIZE] + shape_X + if dim_Y >= 2: + if transpose_Y: + shape_Y = [N, K] + else: + shape_Y = [K, N] + if dim_Y == 3: + shape_Y = [BATCH_SIZE] + shape_Y + return shape_X, shape_Y + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, 1)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((1, Y.size)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
+ Out = np.array([Out], dtype="float32") + return Out + + +class Generator(object): + def setUp(self): + self.op_type = "matmul" + X = np.random.random(self.shape_X).astype("float32") + Y = np.random.random(self.shape_Y).astype("float32") + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y + } + self.outputs = {'Out': Out} + + def test_check_output(self): + self.check_output() + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3) + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=5e-2) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X")) + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], + 'Out', + max_relative_error=5e-2, + no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y')) + if paddle.is_compiled_with_xpu() and len(self.inputs['X'].shape) == len( + self.inputs['Y'].shape) and self.inputs['X'].shape[ + 0] == self.inputs['Y'].shape[0]: + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], + 'Out', + max_relative_error=5e-2, + no_grad_set=set('Y'))
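+ + +# The concrete matmul cases are generated near the bottom of this file with +# type(test_name, (Generator, OpTest), {...}); Generator only supplies setUp() +# and the check_* methods. Each generated class is roughly what one would write +# by hand, e.g. for dim_X=2, dim_Y=2 and no transposes (shapes follow from +# generate_compatible_shapes above with M=3, N=4, K=5): +# +# class TestMatMulOp_dimX_2_dim_Y_2_transX_False_transY_False(Generator, OpTest): +# shape_X = [3, 5] +# shape_Y = [5, 4] +# transpose_X = False +# transpose_Y = False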
+ + +class TestMatmulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of matmul_op must be Variable. + input1 = 12 + self.assertRaises(TypeError, fluid.layers.matmul, input1, input1) + # The input dtype of matmul_op must be float32 or float64. + input2 = fluid.layers.data( + name='input2', shape=[10, 10], dtype="int32") + self.assertRaises(TypeError, fluid.layers.matmul, input2, input2) + input3 = fluid.layers.data( + name='input3', shape=[2, 2], dtype="float16") + fluid.layers.matmul(input3, input3) + + +# Negative dimension generation +def generate_negative_dims(in_shape): + from itertools import combinations + size = len(in_shape) + indices = list() + shapes = list() + for i in range(size): + indices.extend(list(combinations([j for j in range(size)], i + 1))) + for idx in indices: + shapes.append( + [in_shape[i] if i not in idx else -1 for i in range(size)]) + return shapes + + +# Build programs with input sizes that contain negative numbers +def test_negative_dims_program(obj): + for shape_x in generate_negative_dims(obj.shape_X): + for shape_y in generate_negative_dims(obj.shape_Y): + X = np.random.random(obj.shape_X).astype("float32") + Y = np.random.random(obj.shape_Y).astype("float32") + Ref = reference_matmul(X, Y, obj.transpose_X, obj.transpose_Y) + with program_guard(Program(), Program()): + x = fluid.data(name='x', shape=shape_x, dtype='float32') + y = fluid.data(name='y', shape=shape_y, dtype='float32') + output = fluid.layers.matmul(x, y, obj.transpose_X, + obj.transpose_Y) + obj.assertEqual(len(Ref.shape), len(output.shape)) + for idx in range(len(Ref.shape)): + if output.shape[idx] != -1: + obj.assertEqual(Ref.shape[idx], output.shape[idx]) + exe = fluid.Executor(fluid.CPUPlace()) + res, = exe.run(fluid.default_main_program(), + feed={'x': X, + 'y': Y}, + fetch_list=[output]) + obj.assertTrue(np.allclose(res, Ref, atol=1e-5)) + + +# Generate API test cases for all negative-dimension possibilities +def api_test(dim_x, dim_y, trans_x, trans_y): + test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim_x, dim_y, trans_x, trans_y)) + shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x, + trans_y) + globals()[test_name] = type(test_name, (unittest.TestCase, ), { + 'shape_X': shape_x, + 'shape_Y': shape_y, + 'transpose_X': trans_x, + 'transpose_Y': trans_y, + 'test_program': test_negative_dims_program, + }) + + +# Generate operator cases for all possibilities +def inject_test(dim_x, dim_y, trans_x, trans_y): + test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim_x, dim_y, trans_x, trans_y)) + shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x, + trans_y) + globals()[test_name] = type(test_name, (Generator, OpTest), { + 'shape_X': shape_x, + 'shape_Y': shape_y, + 'transpose_X': trans_x, + 'transpose_Y': trans_y, + }) + + +for dim_X in (1, 2, 3): + for dim_Y in (1, 2, 3): + for transpose_x in (False, True): + for transpose_y in (False, True): + inject_test(dim_X, dim_Y, transpose_x, transpose_y) + api_test(dim_X, dim_Y, transpose_x, transpose_y) + + +# Test case n-dim +def generate_compatible_shapes(dim, transpose_X, transpose_Y): + M = 2 + N = 4 + K = 3 + shape_X = [2 for _ in range(dim - 2)] + shape_Y = [2 for _ in range(dim - 2)] + + if transpose_X: + shape_X += [K, M] + else: + shape_X += [M, K] + + if transpose_Y: + shape_Y += [N, K] + else: + shape_Y += [K, N] + + return shape_X, shape_Y + + +# Test case n-dim +for dim in [4]: + for transpose_X in [False, True]: + for transpose_Y in [False, True]: + test_name = ( + 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( + dim, dim, transpose_X, transpose_Y)) + shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X, + transpose_Y) + globals()[test_name] = type(test_name, (Generator, OpTest), { + 
'shape_X': shape_X, + 'shape_Y': shape_Y, + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + }) + + +class API_TestMm(unittest.TestCase): + def test_out(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2], dtype="float64") + y = fluid.data(name='y', shape=[2], dtype='float64') + res = fluid.data(name="output", shape=[1], dtype="float64") + result = paddle.mm(x, y) + exe = fluid.Executor(fluid.CPUPlace()) + data1 = np.random.rand(2) + data2 = np.random.rand(2) + np_res = exe.run(feed={'x': data1, 'y': data2}, fetch_list=[result]) + expected_result = np.matmul( + data1.reshape(1, 2), data2.reshape(2, 1)) + + self.assertTrue( + np.allclose( + np_res, expected_result, atol=1e-5), + "two value is\ + {}\n{}, check diff!".format(np_res, expected_result)) + + def test_dygraph_without_out(self): + device = fluid.CPUPlace() + with fluid.dygraph.guard(device): + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = fluid.dygraph.to_variable(input_array1) + data2 = fluid.dygraph.to_variable(input_array2) + out = paddle.mm(data1, data2) + expected_result = np.matmul(input_array1, input_array2) + self.assertTrue(np.allclose(expected_result, out.numpy())) + + +class Test_API_Matmul(unittest.TestCase): + def test_dygraph_without_out(self): + device = fluid.CPUPlace() + with fluid.dygraph.guard(device): + input_array1 = np.random.rand(3, 4).astype("float64") + input_array2 = np.random.rand(4, 3).astype("float64") + data1 = fluid.dygraph.to_variable(input_array1) + data2 = fluid.dygraph.to_variable(input_array2) + out = paddle.matmul(data1, data2) + expected_result = np.matmul(input_array1, input_array2) + self.assertTrue(np.allclose(expected_result, out.numpy())) + + +class API_TestMmError(unittest.TestCase): + def test_errors(self): + def test_error1(): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.data(name="data1", shape=[10, 2], dtype="float32") + data2 = fluid.data(name="data2", shape=[3, 10], dtype="float32") + paddle.mm(data1, data2) + + self.assertRaises(ValueError, test_error1) + + def test_error2(): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.data( + name="data1", shape=[-1, 10, 2], dtype="float32") + data2 = fluid.data( + name="data2", shape=[-1, 2, 10], dtype="float32") + paddle.mm(data1, data2) + + test_error2() + + def test_error3(): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data1 = fluid.data( + name="data1", shape=[10, 10, 2], dtype="float32") + data2 = fluid.data( + name="data2", shape=[3, 2, 10], dtype="float32") + paddle.mm(data1, data2) + + self.assertRaises(ValueError, test_error3) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py b/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py new file mode 100644 index 0000000000000000000000000000000000000000..94ab5b71e4fbf05fd7ccb61b44dea0cf2a9e0b34 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_mul_op.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import sys +sys.path.append("..") +from op_test import OpTest +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +class TestMulOp(OpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.dtype), + 'Y': np.random.random((5, 21)).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +class TestMulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # The input type of mul_op must be Variable. + x1 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + x2 = fluid.create_lod_tensor( + np.array([[-1]]), [[1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.mul, x1, x2) + # The input dtype of mul_op must be float32 or float64. + x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32") + x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32") + self.assertRaises(TypeError, fluid.layers.mul, x3, x4)
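+ + +# mul flattens each input to a 2-D matrix before multiplying: with +# x_num_col_dims=2 and y_num_col_dims=2, an X of shape (3, 4, 2, 9) acts as a +# (3*4) x (2*9) matrix and a Y of shape (3, 6, 1, 2, 3) as a (3*6) x (1*2*3) +# matrix, which is exactly how TestMulOp2 below builds its reference output: +# +# out = np.dot(X.reshape(3 * 4, 2 * 9), Y.reshape(3 * 6, 1 * 2 * 3)) +# out = out.reshape(3, 4, 1, 2, 3)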
+ + +class TestMulOp2(OpTest): + def setUp(self): + self.op_type = "mul" + self.dtype = np.float64 + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((3, 4, 2, 9)).astype(self.dtype), + 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.dtype) + } + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot(self.inputs['X'].reshape(3 * 4, 2 * 9), + self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3)) + result = result.reshape(3, 4, 1, 2, 3) + self.outputs = {'Out': result} + + def init_dtype_type(self): + pass + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X')) + + def test_check_grad_ignore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp1(TestMulOp): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=1e-1) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ignore_x(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUMulOp2(TestMulOp2): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad_normal(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', max_relative_error=0.9) + + def test_check_grad_ignore_x(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + place = paddle.XPUPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', max_relative_error=0.9, no_grad_set=set('Y')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 7ed571fa9c6a4a962b20397c999368dad0734ff0..69b7fedd72eed52cfd06715025f5cd88983e2e2a 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -301,10 +301,11 @@ class ProgBarLogger(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + lenet = paddle.vision.LeNet() + model = paddle.Model(lenet, inputs, labels) - optim = paddle.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) @@ -436,10 +437,11 @@ class ModelCheckpoint(Callback): train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = 
paddle.Model(paddle.vision.LeNet(classifier_activation=None), + lenet = paddle.vision.LeNet() + model = paddle.Model(lenet, inputs, labels) - optim = paddle.optimizer.Adam(0.001) + optim = paddle.optimizer.Adam(0.001, parameters=lenet.parameters()) model.prepare(optimizer=optim, loss=paddle.nn.CrossEntropyLoss(), metrics=paddle.metric.Accuracy()) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index d00cdf1044bdcd41165777a63feb2950d51e1eb7..fd6161f1cf8ec43ccc391442ebdb7c12769de4cb 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -820,10 +820,9 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - # if use static graph, do not set - paddle.disable_static(device) net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10)) @@ -839,7 +838,7 @@ class Model(object): paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=2, batch_size=32, verbose=1) """ @@ -896,7 +903,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -946,7 +952,6 @@ class Model(object): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) net = nn.Sequential( nn.Linear(784, 200), @@ -991,9 +996,12 @@ class Model(object): import numpy as np import paddle import paddle.nn as nn + from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') + label = InputSpec([None, 1], 'int64', 'label') net = nn.Sequential( nn.Linear(784, 200), @@ -1001,7 +1009,7 @@ class Model(object): nn.Linear(200, 10), nn.Softmax()) - model = paddle.Model(net) + model = paddle.Model(net, input, label) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) out = model.test_batch([data]) @@ -1052,6 +1060,7 @@ class Model(object): def __init__(self): super(Mnist, self).__init__() self.net = nn.Sequential( + nn.Flatten(1), nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), @@ -1071,7 +1080,7 @@ class Model(object): optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - data = paddle.vision.datasets.MNIST(mode='train', chw_format=False) + data = paddle.vision.datasets.MNIST(mode='train') model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference @@ -1118,15 +1127,18 @@ class Model(object): import paddle import paddle.nn as nn - + from paddle.static import InputSpec 
+ device = paddle.set_device('cpu') - paddle.disable_static(device) + + input = InputSpec([None, 784], 'float32', 'x') model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10), - nn.Softmax())) + nn.Softmax()), input) + model.save('checkpoint/test') model.load('checkpoint/test') """ @@ -1191,13 +1203,15 @@ class Model(object): import paddle import paddle.nn as nn + from paddle.static import InputSpec - paddle.disable_static() - + input = InputSpec([None, 784], 'float32', 'x') + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), - nn.Linear(200, 10))) + nn.Linear(200, 10)), input) + params = model.parameters() """ return self._adapter.parameters() @@ -1339,7 +1353,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) @@ -1376,7 +1390,7 @@ class Model(object): label = InputSpec([None, 1], 'int64', 'label') model = paddle.Model( - paddle.vision.models.LeNet(classifier_activation=None), input, label) + paddle.vision.models.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( @@ -1509,7 +1523,7 @@ class Model(object): # imperative mode paddle.disable_static() - model = paddle.Model(paddle.vision.models.LeNet()) + model = paddle.Model(paddle.vision.models.LeNet(), input, label) model.prepare(metrics=paddle.metric.Accuracy()) result = model.evaluate(val_dataset, batch_size=64) print(result) @@ -1606,19 +1620,20 @@ class Model(object): test_dataset = MnistDataset(mode='test', return_label=False) - # declarative mode + # imperative mode input = InputSpec([-1, 1, 28, 28], 'float32', 'image') model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() - result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) - # imperative mode + # declarative mode device = paddle.set_device('cpu') - paddle.disable_static(device) - model = paddle.Model(paddle.vision.models.LeNet()) + paddle.enable_static() + input = InputSpec([-1, 1, 28, 28], 'float32', 'image') + model = paddle.Model(paddle.vision.models.LeNet(), input) model.prepare() + result = model.predict(test_dataset, batch_size=64) print(len(result[0]), result[0][0].shape) """ @@ -1875,15 +1890,11 @@ class Model(object): import paddle from paddle.static import InputSpec - - dynamic = True - device = paddle.set_device('cpu') - paddle.disable_static(device) if dynamic else None input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None), + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 1cd65171ff034e8b834c38184e4452796da985ca..f4a9b8c01d02a109f91aa717342ba47321f5f47e 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -182,7 +182,6 @@ class Accuracy(Metric): import numpy as np import paddle - paddle.disable_static() x = paddle.to_tensor(np.array([ [0.1, 0.2, 0.3, 0.4], [0.1, 0.4, 0.3, 0.2], @@ -202,11 +201,13 @@ class Accuracy(Metric): .. 
code-block:: python import paddle - - paddle.disable_static() + from paddle.static import InputSpec + + input = InputSpec([None, 1, 28, 28], 'float32', 'image') + label = InputSpec([None, 1], 'int64', 'label') train_dataset = paddle.vision.datasets.MNIST(mode='train') - model = paddle.Model(paddle.vision.LeNet(classifier_activation=None)) + model = paddle.Model(paddle.vision.LeNet(), input, label) optim = paddle.optimizer.Adam( learning_rate=0.001, parameters=model.parameters()) model.prepare( diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 1eb9167d0352f36bfcb87db79ba23dce14bac507..bed5df8fa78c753565c8391ba414135e63d335aa 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -571,15 +571,26 @@ def max_pool1d(x, padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): - pool_out = core.ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, - 'paddings', padding, 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, - 'exclusive', True, 'data_format', data_format) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + if return_indices: + pool_out = core.ops.max_pool2d_with_index( + x, 'ksize', kernel_size, 'global_pooling', False, 'strides', + stride, 'paddings', padding, 'padding_algorithm', + padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, + 'use_mkldnn', False, 'exclusive', True, 'data_format', + data_format) + return (squeeze(pool_out[0], [2]), squeeze( + pool_out[1], + [2])) if return_indices else squeeze(pool_out[0], [2]) + else: + pool_out = core.ops.pool2d( + x, 'pooling_type', 'max', 'ksize', kernel_size, + 'global_pooling', False, 'padding_algorithm', padding_algorithm, + 'strides', stride, 'paddings', padding, 'use_cudnn', True, + 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, + 'data_format', data_format) + return squeeze(pool_out, [2]) - op_type = 'max_pool2d_with_index' + op_type = 'max_pool2d_with_index' if return_indices else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -696,7 +707,7 @@ def max_pool2d(x, ) if in_dygraph_mode(): - if data_format == "NCHW": + if return_indices: output = core.ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', @@ -704,7 +715,7 @@ def max_pool2d(x, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) return output if return_indices else output[0] - elif data_format == "NHWC" and not return_indices: + else: output = core.ops.pool2d( x, 'pooling_type', 'max', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, @@ -713,7 +724,7 @@ def max_pool2d(x, 'data_format', data_format) return output - op_type = 'max_pool2d_with_index' if data_format == "NCHW" else "pool2d" + op_type = 'max_pool2d_with_index' if return_indices else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -822,7 +833,7 @@ def max_pool3d(x, ) if in_dygraph_mode(): - if data_format == "NCDHW": + if return_indices: output = core.ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, @@ -830,7 
+841,7 @@ def max_pool3d(x, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) return output if return_indices else output[0] - elif data_format == "NDHWC" and not return_indices: + else: output = core.ops.pool3d( x, 'pooling_type', 'max', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, @@ -839,7 +850,7 @@ def max_pool3d(x, 'data_format', data_format) return output - op_type = "max_pool3d_with_index" if data_format == "NCDHW" else "pool3d" + op_type = "max_pool3d_with_index" if return_indices else "pool3d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index eb70320ea7551de6e1117900e3769f000fdf23dd..d7a3cfcdb92debe0447cb4054478729e92dbab32 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -19,10 +19,12 @@ from paddle.fluid import core, Variable from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.data_feeder import check_type from paddle.fluid.framework import convert_np_dtype_to_dtype_ +from paddle.fluid.framework import static_only __all__ = ['data', 'InputSpec'] +@static_only def data(name, shape, dtype=None, lod_level=0): """ **Data Layer** diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index f27cfba487d78f284408815eaba933b18f303df9..15580b6618e6dc61d5e74216776417a02846a16a 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -156,8 +156,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): - check_variable_and_dtype(val, name, ['float32', 'float64'], - 'matmul') + check_variable_and_dtype( + val, name, ['float16', 'float32', 'float64'], 'matmul') __check_input(x, y) @@ -707,20 +707,14 @@ def cross(x, y, axis=None, name=None): Examples: .. code-block:: python import paddle - from paddle import to_variable - import numpy as np - paddle.disable_static() - data_x = np.array([[1.0, 1.0, 1.0], - [2.0, 2.0, 2.0], - [3.0, 3.0, 3.0]]) - data_y = np.array([[1.0, 1.0, 1.0], - [1.0, 1.0, 1.0], - [1.0, 1.0, 1.0]]) - x = to_variable(data_x) - y = to_variable(data_y) - + x = paddle.to_tensor([[1.0, 1.0, 1.0], + [2.0, 2.0, 2.0], + [3.0, 3.0, 3.0]]) + y = paddle.to_tensor([[1.0, 1.0, 1.0], + [1.0, 1.0, 1.0], + [1.0, 1.0, 1.0]]) z1 = paddle.cross(x, y) print(z1.numpy()) # [[-1. -1. -1.] diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 966544c7abb54ae7de163aa322890a55ee94d3d8..ce32fb76f5cd4da30f95baa3b8928d1c879477ca 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1650,12 +1650,11 @@ def cumsum(x, axis=None, dtype=None, name=None): .. 
code-block:: python import paddle - from paddle import to_variable import numpy as np paddle.disable_static() data_np = np.arange(12).reshape(3, 4) - data = to_variable(data_np) + data = paddle.to_tensor(data_np) y = paddle.cumsum(data) print(y.numpy()) diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 6fb73b08c11b417332b064df7408e78ed390cc2f..e1bc65a5d15c2883e14d20c5e06c2ee3cd726ea5 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -8,10 +8,6 @@ foreach(TEST_OP ${DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() -# disable test_pretrained_model and test_vision_models -list(REMOVE_ITEM TEST_OPS test_pretrained_model) -list(REMOVE_ITEM TEST_OPS test_vision_models) - foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index 13d966bf38f2aaed35e120aa4d25705cfc36c230..46d02789402b22263cfbd8cbdfeb6d66a5de900d 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -68,7 +68,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py index 9d8e5f3652c9810579a0b66035a64d1d3b915bff..eab34a6dafbc354a24aa51e93a9fec9efc3b3cee 100644 --- a/python/paddle/tests/dist_hapi_mnist_static.py +++ b/python/paddle/tests/dist_hapi_mnist_static.py @@ -67,7 +67,7 @@ class TestDistTraning(unittest.TestCase): inputs = [Input(im_shape, 'float32', 'image')] labels = [Input([None, 1], 'int64', 'label')] - model = Model(LeNet(classifier_activation=None), inputs, labels) + model = Model(LeNet(), inputs, labels) optim = fluid.optimizer.Momentum( learning_rate=0.001, momentum=.9, parameter_list=model.parameters()) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 2de39c709d899e5ee56a0ec43c52233de132c920..34e7065cd2391f74dc9d3c62dd1a7eab7bea4afb 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -40,7 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTra class LeNetDygraph(paddle.nn.Layer): - def __init__(self, num_classes=10, classifier_activation=None): + def __init__(self, num_classes=10): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( @@ -55,8 +55,7 @@ class LeNetDygraph(paddle.nn.Layer): if num_classes > 0: self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + Linear(400, 120), Linear(120, 84), Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) @@ -67,6 +66,34 @@ class LeNetDygraph(paddle.nn.Layer): return x +class LeNetDeclarative(fluid.dygraph.Layer): + def __init__(self, num_classes=10): + super(LeNetDeclarative, self).__init__() + self.num_classes = num_classes + self.features = Sequential( + Conv2d( + 1, 6, 3, stride=1, padding=1), + ReLU(), + Pool2D(2, 'max', 2), + Conv2d( + 6, 16, 5, stride=1, 
padding=0), + ReLU(), + Pool2D(2, 'max', 2)) + + if num_classes > 0: + self.fc = Sequential( + Linear(400, 120), Linear(120, 84), Linear(84, 10)) + + @declarative + def forward(self, inputs): + x = self.features(inputs) + + if self.num_classes > 0: + x = fluid.layers.flatten(x, 1) + x = self.fc(x) + return x + + class MnistDataset(MNIST): def __init__(self, mode, return_label=True, sample_num=None): super(MnistDataset, self).__init__(mode=mode) @@ -198,7 +225,7 @@ class TestModel(unittest.TestCase): paddle.manual_seed(seed) paddle.framework.random._manual_program_seed(seed) - net = LeNet(classifier_activation=None) + net = LeNet() optim_new = fluid.optimizer.Adam( learning_rate=0.001, parameter_list=net.parameters()) model = Model(net, inputs=self.inputs, labels=self.labels) @@ -287,14 +314,12 @@ class TestModel(unittest.TestCase): class MyModel(paddle.nn.Layer): - def __init__(self, classifier_activation='softmax'): + def __init__(self): super(MyModel, self).__init__() self._fc = Linear(20, 10) - self._act = Softmax() #Todo: accept any activation def forward(self, x): y = self._fc(x) - y = self._act(y) return y @@ -311,7 +336,7 @@ class TestModelFunction(unittest.TestCase): def get_expect(): fluid.enable_dygraph(fluid.CPUPlace()) self.set_seed() - m = MyModel(classifier_activation=None) + m = MyModel() optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=m.parameters()) m.train() @@ -330,7 +355,7 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(device) if dynamic else None self.set_seed() - net = MyModel(classifier_activation=None) + net = MyModel() optim2 = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=net.parameters()) @@ -374,7 +399,7 @@ class TestModelFunction(unittest.TestCase): for dynamic in [True, False]: device = paddle.set_device('cpu') fluid.enable_dygraph(device) if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -415,7 +440,7 @@ class TestModelFunction(unittest.TestCase): # dynamic saving device = paddle.set_device('cpu') fluid.enable_dygraph(device) - model = Model(MyModel(classifier_activation=None)) + model = Model(MyModel()) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -424,7 +449,7 @@ class TestModelFunction(unittest.TestCase): inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - model = Model(MyModel(classifier_activation=None), inputs, labels) + model = Model(MyModel(), inputs, labels) optim = fluid.optimizer.SGD(learning_rate=0.001, parameter_list=model.parameters()) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -434,7 +459,7 @@ class TestModelFunction(unittest.TestCase): def test_static_save_dynamic_load(self): path = tempfile.mkdtemp() - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = fluid.optimizer.SGD(learning_rate=0.001, @@ -446,7 +471,7 @@ class TestModelFunction(unittest.TestCase): device = paddle.set_device('cpu') fluid.enable_dygraph(device) #if dynamic else None - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] optim = 
fluid.optimizer.SGD(learning_rate=0.001, @@ -582,13 +607,10 @@ class TestModelFunction(unittest.TestCase): class TestRaiseError(unittest.TestCase): def test_input_without_name(self): - net = MyModel(classifier_activation=None) + net = MyModel() inputs = [InputSpec([None, 10], 'float32')] labels = [InputSpec([None, 1], 'int64', 'label')] with self.assertRaises(ValueError): model = Model(net, inputs, labels) - def test_export_deploy_model_without_inputs_and_run_in_dygraph(self): - paddle.disable_static() - net = MyModel(classifier_activation=None) @@ -599,6 +625,10 @@ class TestRaiseError(unittest.TestCase): - model = Model(net) - model.save(save_dir, training=False) - paddle.enable_static() + def test_input_without_input_spec(self): + for dynamic in [True, False]: + paddle.disable_static() if dynamic else None + net = MyModel() + with self.assertRaises(TypeError): + model = Model(net) + paddle.enable_static() if __name__ == '__main__': diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index 641147d39e94f7c2bbb426900ed484546bad49c6..bf9c2a2ae061179bd9d656fa3cb23c5ac93c6c53 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -13,6 +13,8 @@ # limitations under the License. import unittest +import tempfile +import shutil import numpy as np import paddle @@ -23,27 +25,36 @@ import paddle.vision.models as models # test the predicted results of static graph and dynamic graph are equal # when using a pretrained model class TestPretrainedModel(unittest.TestCase): - def infer(self, x, arch, dygraph=True): - if dygraph: - paddle.disable_static() - - net = models.__dict__[arch](pretrained=True, classifier_activation=None) - inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] - model = paddle.Model(network=net, inputs=inputs) - model.prepare() - res = model.test_batch(x) - - if dygraph: - paddle.enable_static() - return res + def infer(self, arch): + path = tempfile.mkdtemp() + x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) + res = {} + for dygraph in [True, False]: + if not dygraph: + paddle.enable_static() + + net = models.__dict__[arch]() + inputs = [InputSpec([None, 3, 224, 224], 'float32', 'image')] + model = paddle.Model(network=net, inputs=inputs) + model.prepare() + + if dygraph: + model.save(path) + res['dygraph'] = model.test_batch(x) + else: + model.load(path) + res['static'] = model.test_batch(x) + + if not dygraph: + paddle.disable_static() + + shutil.rmtree(path) + np.testing.assert_allclose(res['dygraph'], res['static']) def test_models(self): arches = ['mobilenet_v1', 'mobilenet_v2', 'resnet18'] for arch in arches: - x = np.array(np.random.random((2, 3, 224, 224)), dtype=np.float32) - y_dygraph = self.infer(x, arch) - y_static = self.infer(x, arch, dygraph=False) - np.testing.assert_allclose(y_dygraph, y_static) + self.infer(arch) if __name__ == '__main__': diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 44f9ab5390122f086af4168e225fe2b5a2d8a9b2..6489b02615bb94269f83c4ed780e555c487eacbe 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -36,7 +36,7 @@ class TestVisonModels(unittest.TestCase): model.test_batch(x) def test_mobilenetv2_pretrained(self): - self.models_infer('mobilenet_v2', pretrained=True) + self.models_infer('mobilenet_v2', pretrained=False) def
test_mobilenetv1(self): self.models_infer('mobilenet_v1') diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index c2d4be7cda10d580af44154e6a03e0871ec20706..b30d5992f9adf792f0bae90e19b9c00c4d47c0a2 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -12,20 +12,19 @@ #See the License for the specific language governing permissions and #limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, Linear, ReLU, Sequential, Softmax +import paddle +import paddle.nn as nn __all__ = ['LeNet'] -class LeNet(fluid.dygraph.Layer): +class LeNet(nn.Layer): """LeNet model from `"LeCun Y, Bottou L, Bengio Y, et al. Gradient-based learning applied to document recognition[J]. Proceedings of the IEEE, 1998, 86(11): 2278-2324.`_ Args: num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 10. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. code-block:: python @@ -35,28 +34,27 @@ class LeNet(fluid.dygraph.Layer): model = LeNet() """ - def __init__(self, num_classes=10, classifier_activation='softmax'): + def __init__(self, num_classes=10): super(LeNet, self).__init__() self.num_classes = num_classes - self.features = Sequential( - Conv2d( + self.features = nn.Sequential( + nn.Conv2d( 1, 6, 3, stride=1, padding=1), - ReLU(), - Pool2D(2, 'max', 2), - Conv2d( + nn.ReLU(), + nn.MaxPool2d(2, 2), + nn.Conv2d( 6, 16, 5, stride=1, padding=0), - ReLU(), - Pool2D(2, 'max', 2)) + nn.ReLU(), + nn.MaxPool2d(2, 2)) if num_classes > 0: - self.fc = Sequential( - Linear(400, 120), Linear(120, 84), Linear(84, 10), - Softmax()) #Todo: accept any activation + self.fc = nn.Sequential( + nn.Linear(400, 120), nn.Linear(120, 84), nn.Linear(84, 10)) def forward(self, inputs): x = self.features(inputs) if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) + x = paddle.flatten(x, 1) x = self.fc(x) return x diff --git a/python/paddle/vision/models/mobilenetv1.py b/python/paddle/vision/models/mobilenetv1.py index 10defbf593dca642386e73b65094612f93dce9dc..39654122e3b33e52f4693653dbdd14d6e513228e 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle.fluid as fluid -from paddle.fluid.initializer import MSRA -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -24,85 +22,66 @@ __all__ = ['MobileNetV1', 'mobilenet_v1'] model_urls = { 'mobilenetv1_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v1_x1.0.pdparams', - 'bf0d25cb0bed1114d9dac9384ce2b4a6') + '42a154c2f26f86e7457d6daded114e8c') } -class ConvBNLayer(fluid.dygraph.Layer): +class ConvBNLayer(nn.Layer): def __init__(self, - num_channels, - filter_size, - num_filters, + in_channels, + out_channels, + kernel_size, stride, padding, - channels=None, - num_groups=1, - act='relu', - use_cudnn=True, - name=None): + num_groups=1): super(ConvBNLayer, self).__init__() - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, stride=stride, padding=padding, groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "_weights"), bias_attr=False) - self._batch_norm = BatchNorm( - num_filters, - act=act, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') + self._norm_layer = nn.BatchNorm2d(out_channels) + self._act = nn.ReLU() - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - return y + def forward(self, x): + x = self._conv(x) + x = self._norm_layer(x) + x = self._act(x) + return x -class DepthwiseSeparable(fluid.dygraph.Layer): - def __init__(self, - num_channels, - num_filters1, - num_filters2, - num_groups, - stride, - scale, - name=None): +class DepthwiseSeparable(nn.Layer): + def __init__(self, in_channels, out_channels1, out_channels2, num_groups, + stride, scale): super(DepthwiseSeparable, self).__init__() self._depthwise_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=int(num_filters1 * scale), - filter_size=3, + in_channels, + int(out_channels1 * scale), + kernel_size=3, stride=stride, padding=1, - num_groups=int(num_groups * scale), - use_cudnn=False) + num_groups=int(num_groups * scale)) self._pointwise_conv = ConvBNLayer( - num_channels=int(num_filters1 * scale), - filter_size=1, - num_filters=int(num_filters2 * scale), + int(out_channels1 * scale), + int(out_channels2 * scale), + kernel_size=1, stride=1, padding=0) - def forward(self, inputs): - y = self._depthwise_conv(inputs) - y = self._pointwise_conv(y) - return y + def forward(self, x): + x = self._depthwise_conv(x) + x = self._pointwise_conv(x) + return x -class MobileNetV1(fluid.dygraph.Layer): +class MobileNetV1(nn.Layer): """MobileNetV1 model from `"MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" `_. @@ -111,7 +90,6 @@ class MobileNetV1(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -121,11 +99,7 @@ class MobileNetV1(fluid.dygraph.Layer): model = MobileNetV1() """ - def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): super(MobileNetV1, self).__init__() self.scale = scale self.dwsl = [] @@ -133,18 +107,17 @@ class MobileNetV1(fluid.dygraph.Layer): self.with_pool = with_pool self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - channels=3, - num_filters=int(32 * scale), + in_channels=3, + out_channels=int(32 * scale), + kernel_size=3, stride=2, padding=1) dws21 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(32 * scale), - num_filters1=32, - num_filters2=64, + in_channels=int(32 * scale), + out_channels1=32, + out_channels2=64, num_groups=32, stride=1, scale=scale), @@ -153,9 +126,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws22 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(64 * scale), - num_filters1=64, - num_filters2=128, + in_channels=int(64 * scale), + out_channels1=64, + out_channels2=128, num_groups=64, stride=2, scale=scale), @@ -164,9 +137,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws31 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=128, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=128, num_groups=128, stride=1, scale=scale), @@ -175,9 +148,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws32 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(128 * scale), - num_filters1=128, - num_filters2=256, + in_channels=int(128 * scale), + out_channels1=128, + out_channels2=256, num_groups=128, stride=2, scale=scale), @@ -186,9 +159,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws41 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=256, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=256, num_groups=256, stride=1, scale=scale), @@ -197,9 +170,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws42 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(256 * scale), - num_filters1=256, - num_filters2=512, + in_channels=int(256 * scale), + out_channels1=256, + out_channels2=512, num_groups=256, stride=2, scale=scale), @@ -209,9 +182,9 @@ class MobileNetV1(fluid.dygraph.Layer): for i in range(5): tmp = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=512, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=512, num_groups=512, stride=1, scale=scale), @@ -220,9 +193,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws56 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(512 * scale), - num_filters1=512, - num_filters2=1024, + in_channels=int(512 * scale), + out_channels1=512, + out_channels2=1024, num_groups=512, stride=2, scale=scale), @@ -231,9 +204,9 @@ class MobileNetV1(fluid.dygraph.Layer): dws6 = self.add_sublayer( sublayer=DepthwiseSeparable( - num_channels=int(1024 * scale), - num_filters1=1024, - num_filters2=1024, + in_channels=int(1024 * scale), + out_channels1=1024, + out_channels2=1024, num_groups=1024, stride=1, scale=scale), @@ -241,29 +214,23 @@ class MobileNetV1(fluid.dygraph.Layer): self.dwsl.append(dws6) if with_pool: - self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > -1: - self.out = Linear( - int(1024 * scale), - 
num_classes, - act=classifier_activation, - param_attr=ParamAttr( - initializer=MSRA(), name=self.full_name() + "fc7_weights"), - bias_attr=ParamAttr(name="fc7_offset")) - - def forward(self, inputs): - y = self.conv1(inputs) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if num_classes > 0: + self.fc = nn.Linear(int(1024 * scale), num_classes) + + def forward(self, x): + x = self.conv1(x) for dws in self.dwsl: - y = dws(y) + x = dws(x) if self.with_pool: - y = self.pool2d_avg(y) + x = self.pool2d_avg(x) if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, 1024]) - y = self.out(y) - return y + x = paddle.flatten(x, 1) + x = self.fc(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -275,7 +242,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index c08fb88f8bdb234fec99ed139aa7eb6249965c79..bab8b7b2b1b93bb17612843bf0032ee278c3e93f 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -14,9 +14,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -from paddle.fluid.param_attr import ParamAttr -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear + +import paddle.nn as nn +import paddle.nn.functional as F from paddle.utils.download import get_weights_path_from_url @@ -25,221 +25,166 @@ __all__ = ['MobileNetV2', 'mobilenet_v2'] model_urls = { 'mobilenetv2_1.0': ('https://paddle-hapi.bj.bcebos.com/models/mobilenet_v2_x1.0.pdparams', - '8ff74f291f72533f2a7956a4efff9d88') + '0340af0a901346c8d46f4529882fb63d') } -class ConvBNLayer(fluid.dygraph.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - padding, - channels=None, - num_groups=1, - use_cudnn=True): - super(ConvBNLayer, self).__init__() - - tmp_param = ParamAttr(name=self.full_name() + "_weights") - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - act=None, - use_cudnn=use_cudnn, - param_attr=tmp_param, - bias_attr=False) - - self._batch_norm = BatchNorm( - num_filters, - param_attr=ParamAttr(name=self.full_name() + "_bn" + "_scale"), - bias_attr=ParamAttr(name=self.full_name() + "_bn" + "_offset"), - moving_mean_name=self.full_name() + "_bn" + '_mean', - moving_variance_name=self.full_name() + "_bn" + '_variance') - - def forward(self, inputs, if_act=True): - y = self._conv(inputs) - y = self._batch_norm(y) - if if_act: - y = fluid.layers.relu6(y) - return y - - -class InvertedResidualUnit(fluid.dygraph.Layer): - def __init__( - self, - num_channels, - num_in_filter, - num_filters, - stride, - filter_size, - padding, - expansion_factor, ): - super(InvertedResidualUnit, self).__init__() - num_expfilter = int(round(num_in_filter * expansion_factor)) - self._expand_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_expfilter, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - self._bottleneck_conv = ConvBNLayer( - num_channels=num_expfilter, - num_filters=num_expfilter, - filter_size=filter_size, - stride=stride, - padding=padding, - num_groups=num_expfilter, - use_cudnn=False) - - self._linear_conv = ConvBNLayer( - 
num_channels=num_expfilter, - num_filters=num_filters, - filter_size=1, - stride=1, - padding=0, - num_groups=1) - - def forward(self, inputs, ifshortcut): - y = self._expand_conv(inputs, if_act=True) - y = self._bottleneck_conv(y, if_act=True) - y = self._linear_conv(y, if_act=False) - if ifshortcut: - y = fluid.layers.elementwise_add(inputs, y) - return y - - -class InvresiBlocks(fluid.dygraph.Layer): - def __init__(self, in_c, t, c, n, s): - super(InvresiBlocks, self).__init__() - - self._first_block = InvertedResidualUnit( - num_channels=in_c, - num_in_filter=in_c, - num_filters=c, - stride=s, - filter_size=3, - padding=1, - expansion_factor=t) - - self._inv_blocks = [] - for i in range(1, n): - tmp = self.add_sublayer( - sublayer=InvertedResidualUnit( - num_channels=c, - num_in_filter=c, - num_filters=c, - stride=1, - filter_size=3, - padding=1, - expansion_factor=t), - name=self.full_name() + "_" + str(i + 1)) - self._inv_blocks.append(tmp) - - def forward(self, inputs): - y = self._first_block(inputs, ifshortcut=False) - for inv_block in self._inv_blocks: - y = inv_block(y, ifshortcut=True) - return y - - -class MobileNetV2(fluid.dygraph.Layer): - """MobileNetV2 model from - `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. - - Args: - scale (float): scale of channels in each layer. Default: 1.0. - num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer - will not be defined. Default: 1000. - with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. - - Examples: - .. code-block:: python +def _make_divisible(v, divisor, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - from paddle.vision.models import MobileNetV2 + if new_v < 0.9 * v: + new_v += divisor + return new_v - model = MobileNetV2() - """ +class ConvBNReLU(nn.Sequential): + def __init__(self, + in_planes, + out_planes, + kernel_size=3, + stride=1, + groups=1, + norm_layer=nn.BatchNorm2d): + padding = (kernel_size - 1) // 2 + + super(ConvBNReLU, self).__init__( + nn.Conv2d( + in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=groups, + bias_attr=False), + norm_layer(out_planes), + nn.ReLU6()) + + +class InvertedResidual(nn.Layer): def __init__(self, - scale=1.0, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + inp, + oup, + stride, + expand_ratio, + norm_layer=nn.BatchNorm2d): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2] + + hidden_dim = int(round(inp * expand_ratio)) + self.use_res_connect = self.stride == 1 and inp == oup + + layers = [] + if expand_ratio != 1: + layers.append( + ConvBNReLU( + inp, hidden_dim, kernel_size=1, norm_layer=norm_layer)) + layers.extend([ + ConvBNReLU( + hidden_dim, + hidden_dim, + stride=stride, + groups=hidden_dim, + norm_layer=norm_layer), + nn.Conv2d( + hidden_dim, oup, 1, 1, 0, bias_attr=False), + norm_layer(oup), + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Layer): + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + """MobileNetV2 model from + `"MobileNetV2: Inverted Residuals and Linear Bottlenecks" `_. + + Args: + scale (float): scale of channels in each layer. Default: 1.0. + num_classes (int): output dim of last fc layer. 
If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool): use pool before the last fc layer or not. Default: True. + + Examples: + .. code-block:: python + + from paddle.vision.models import MobileNetV2 + + model = MobileNetV2() + """ super(MobileNetV2, self).__init__() - self.scale = scale self.num_classes = num_classes self.with_pool = with_pool + input_channel = 32 + last_channel = 1280 + + block = InvertedResidual + round_nearest = 8 + norm_layer = nn.BatchNorm2d + inverted_residual_setting = [ + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] - bottleneck_params_list = [ - (1, 16, 1, 1), - (6, 24, 2, 2), - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1), + input_channel = _make_divisible(input_channel * scale, round_nearest) + self.last_channel = _make_divisible(last_channel * max(1.0, scale), + round_nearest) + features = [ + ConvBNReLU( + 3, input_channel, stride=2, norm_layer=norm_layer) ] - self._conv1 = ConvBNLayer( - num_channels=3, - num_filters=int(32 * scale), - filter_size=3, - stride=2, - padding=1) - - self._invl = [] - i = 1 - in_c = int(32 * scale) - for layer_setting in bottleneck_params_list: - t, c, n, s = layer_setting - i += 1 - tmp = self.add_sublayer( - sublayer=InvresiBlocks( - in_c=in_c, t=t, c=int(c * scale), n=n, s=s), - name='conv' + str(i)) - self._invl.append(tmp) - in_c = int(c * scale) - - self._out_c = int(1280 * scale) if scale > 1.0 else 1280 - self._conv9 = ConvBNLayer( - num_channels=in_c, - num_filters=self._out_c, - filter_size=1, - stride=1, - padding=0) + for t, c, n, s in inverted_residual_setting: + output_channel = _make_divisible(c * scale, round_nearest) + for i in range(n): + stride = s if i == 0 else 1 + features.append( + block( + input_channel, + output_channel, + stride, + expand_ratio=t, + norm_layer=norm_layer)) + input_channel = output_channel + + features.append( + ConvBNReLU( + input_channel, + self.last_channel, + kernel_size=1, + norm_layer=norm_layer)) + + self.features = nn.Sequential(*features) if with_pool: - self._pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - if num_classes > 0: - tmp_param = ParamAttr(name=self.full_name() + "fc10_weights") - self._fc = Linear( - self._out_c, - num_classes, - act=classifier_activation, - param_attr=tmp_param, - bias_attr=ParamAttr(name="fc10_offset")) - - def forward(self, inputs): - y = self._conv1(inputs, if_act=True) - for inv in self._invl: - y = inv(y) - y = self._conv9(y, if_act=True) + self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(0.2), nn.Linear(self.last_channel, num_classes)) + + def forward(self, x): + x = self.features(x) if self.with_pool: - y = self._pool2d_avg(y) + x = self.pool2d_avg(x) + if self.num_classes > 0: - y = fluid.layers.reshape(y, shape=[-1, self._out_c]) - y = self._fc(y) - return y + x = paddle.flatten(x, 1) + x = self.classifier(x) + return x def _mobilenet(arch, pretrained=False, **kwargs): @@ -251,7 +196,7 @@ def _mobilenet(arch, pretrained=False, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 
da0c3e9eb3f67f0aad67cdef3c5527cb2275e844..f9e00aefd6bb2b6d0b7bf75055cc735c6651a52d 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -15,11 +15,8 @@ from __future__ import division from __future__ import print_function -import math -import paddle.fluid as fluid - -from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -29,143 +26,129 @@ __all__ = [ model_urls = { 'resnet18': ('https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams', - '0ba53eea9bc970962d0ef96f7b94057e'), + 'cf548f46534aa3560945be4b95cd11c4'), 'resnet34': ('https://paddle-hapi.bj.bcebos.com/models/resnet34.pdparams', - '46bc9f7c3dd2e55b7866285bee91eff3'), + '8d2275cf8706028345f78ac0e1d31969'), 'resnet50': ('https://paddle-hapi.bj.bcebos.com/models/resnet50.pdparams', - '5ce890a9ad386df17cf7fe2313dca0a1'), + 'ca6f485ee1ab0492d38f323885b0ad80'), 'resnet101': ('https://paddle-hapi.bj.bcebos.com/models/resnet101.pdparams', - 'fb07a451df331e4b0bb861ed97c3a9b9'), + '02f35f034ca3858e1e54d4036443c92d'), 'resnet152': ('https://paddle-hapi.bj.bcebos.com/models/resnet152.pdparams', - 'f9c700f26d3644bb76ad2226ed5f5713'), + '7ad16a2f1e7333859ff986138630fd7a'), } -class ConvBNLayer(fluid.dygraph.Layer): +class BasicBlock(nn.Layer): + expansion = 1 + def __init__(self, - num_channels, - num_filters, - filter_size, + inplanes, + planes, stride=1, + downsample=None, groups=1, - act=None): - super(ConvBNLayer, self).__init__() - - self._conv = Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - act=None, - bias_attr=False) - - self._batch_norm = BatchNorm(num_filters, act=act) - - def forward(self, inputs): - x = self._conv(inputs) - x = self._batch_norm(x) - - return x - - -class BasicBlock(fluid.dygraph.Layer): - """residual block of resnet18 and resnet34 - """ - expansion = 1 - - def __init__(self, num_channels, num_filters, stride, shortcut=True): + base_width=64, + dilation=1, + norm_layer=None): super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - stride=stride, - act='relu') + if dilation > 1: + raise NotImplementedError( + "Dilation > 1 not supported in BasicBlock") - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride) + self.conv1 = nn.Conv2d( + inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU() + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv2(out) + out = self.bn2(out) - y = short + conv1 + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(y) + out += identity + 
out = self.relu(out) + return out -class BottleneckBlock(fluid.dygraph.Layer): - """residual block of resnet50, resnet101 amd resnet152 - """ + +class BottleneckBlock(nn.Layer): expansion = 4 - def __init__(self, num_channels, num_filters, stride, shortcut=True): + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None): super(BottleneckBlock, self).__init__() - - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - act='relu') - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, + if norm_layer is None: + norm_layer = nn.BatchNorm2d + width = int(planes * (base_width / 64.)) * groups + + self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False) + self.bn1 = norm_layer(width) + + self.conv2 = nn.Conv2d( + width, + width, + 3, + padding=dilation, stride=stride, - act='relu') - self.conv2 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * self.expansion, - filter_size=1, - act=None) + groups=groups, + dilation=dilation, + bias_attr=False) + self.bn2 = norm_layer(width) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters * self.expansion, - filter_size=1, - stride=stride) + self.conv3 = nn.Conv2d( + width, planes * self.expansion, 1, bias_attr=False) + self.bn3 = norm_layer(planes * self.expansion) + self.relu = nn.ReLU() + self.downsample = downsample + self.stride = stride - self.shortcut = shortcut + def forward(self, x): + identity = x - self._num_channels_out = num_filters * self.expansion + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) - def forward(self, inputs): - x = self.conv0(inputs) - conv1 = self.conv1(x) - conv2 = self.conv2(conv1) + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) + out = self.conv3(out) + out = self.bn3(out) - x = fluid.layers.elementwise_add(x=short, y=conv2) + if self.downsample is not None: + identity = self.downsample(x) - return fluid.layers.relu(x) + out += identity + out = self.relu(out) + return out -class ResNet(fluid.dygraph.Layer): + +class ResNet(nn.Layer): """ResNet model from `"Deep Residual Learning for Image Recognition" `_ @@ -175,7 +158,6 @@ class ResNet(fluid.dygraph.Layer): num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. with_pool (bool): use pool before the last fc layer or not. Default: True. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -189,82 +171,87 @@ class ResNet(fluid.dygraph.Layer): """ - def __init__(self, - Block, - depth=50, - num_classes=1000, - with_pool=True, - classifier_activation='softmax'): + def __init__(self, block, depth, num_classes=1000, with_pool=True): super(ResNet, self).__init__() - - self.num_classes = num_classes - self.with_pool = with_pool - - layer_config = { + layer_cfg = { 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], + 152: [3, 8, 36, 3] } - assert depth in layer_config.keys(), \ - "supported depth are {} but input layer is {}".format( - layer_config.keys(), depth) - - layers = layer_config[depth] - - in_channels = 64 - out_channels = [64, 128, 256, 512] - - self.conv = ConvBNLayer( - num_channels=3, num_filters=64, filter_size=7, stride=2, act='relu') - self.pool = Pool2D( - pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') - - self.layers = [] - for idx, num_blocks in enumerate(layers): - blocks = [] - shortcut = False - for b in range(num_blocks): - if b == 1: - in_channels = out_channels[idx] * Block.expansion - block = Block( - num_channels=in_channels, - num_filters=out_channels[idx], - stride=2 if b == 0 and idx != 0 else 1, - shortcut=shortcut) - blocks.append(block) - shortcut = True - layer = self.add_sublayer("layer_{}".format(idx), - Sequential(*blocks)) - self.layers.append(layer) + layers = layer_cfg[depth] + self.num_classes = num_classes + self.with_pool = with_pool + self._norm_layer = nn.BatchNorm2d + + self.inplanes = 64 + self.dilation = 1 + self.conv1 = nn.Conv2d( + 3, + self.inplanes, + kernel_size=7, + stride=2, + padding=3, + bias_attr=False) + self.bn1 = self._norm_layer(self.inplanes) + self.relu = nn.ReLU() + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = self._make_layer(block, 64, layers[0]) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 256, layers[2], stride=2) + self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if with_pool: - self.global_pool = Pool2D( - pool_size=7, pool_type='avg', global_pooling=True) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) if num_classes > 0: - stdv = 1.0 / math.sqrt(out_channels[-1] * Block.expansion * 1.0) - self.fc_input_dim = out_channels[-1] * Block.expansion * 1 * 1 - self.fc = Linear( - self.fc_input_dim, - num_classes, - act=classifier_activation, - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Uniform(-stdv, stdv))) - - def forward(self, inputs): - x = self.conv(inputs) - x = self.pool(x) - for layer in self.layers: - x = layer(x) - - if self.with_pool: - x = self.global_pool(x) - - if self.num_classes > -1: - x = fluid.layers.reshape(x, shape=[-1, self.fc_input_dim]) + self.fc = nn.Linear(512 * block.expansion, num_classes) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + norm_layer = self._norm_layer + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, + planes * block.expansion, + 1, + stride=stride, + bias_attr=False), + norm_layer(planes * block.expansion), ) + + layers = [] + layers.append( + block(self.inplanes, planes, stride, downsample, 1, 64, + previous_dilation, norm_layer)) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, 
norm_layer=norm_layer)) + + return nn.Sequential(*layers) + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + + if self.with_pool > 0: + x = self.avgpool(x) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) x = self.fc(x) + return x @@ -277,7 +264,7 @@ def _resnet(arch, Block, depth, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.set_dict(param) return model diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index 8bfacda2476d0e24e549513b379181bf47e40d45..d11845b6616267d1cdfff197cc2c4a25a62c7d9e 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.fluid as fluid -from paddle.nn import Conv2d, Pool2D, BatchNorm, Linear, ReLU, Softmax -from paddle.fluid.dygraph.container import Sequential +import paddle +import paddle.nn as nn from paddle.utils.download import get_weights_path_from_url @@ -28,39 +27,18 @@ __all__ = [ model_urls = { 'vgg16': ('https://paddle-hapi.bj.bcebos.com/models/vgg16.pdparams', - 'c788f453a3b999063e8da043456281ee') + '89bbffc0f87d260be9b8cdc169c991c4') } -class Classifier(fluid.dygraph.Layer): - def __init__(self, num_classes, classifier_activation='softmax'): - super(Classifier, self).__init__() - self.linear1 = Linear(512 * 7 * 7, 4096) - self.linear2 = Linear(4096, 4096) - self.linear3 = Linear(4096, num_classes) - self.act = Softmax() #Todo: accept any activation - - def forward(self, x): - x = self.linear1(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear2(x) - x = fluid.layers.relu(x) - x = fluid.layers.dropout(x, 0.5) - x = self.linear3(x) - out = self.act(x) - return out - - -class VGG(fluid.dygraph.Layer): +class VGG(nn.Layer): """VGG model from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `_ Args: - features (fluid.dygraph.Layer): vgg features create by function make_layers. + features (nn.Layer): vgg features create by function make_layers. num_classes (int): output dim of last fc layer. If num_classes <=0, last fc layer will not be defined. Default: 1000. - classifier_activation (str): activation for the last fc layer. Default: 'softmax'. Examples: .. 
code-block:: python @@ -76,44 +54,41 @@ class VGG(fluid.dygraph.Layer): """ - def __init__(self, - features, - num_classes=1000, - classifier_activation='softmax'): + def __init__(self, features, num_classes=1000): super(VGG, self).__init__() self.features = features - self.num_classes = num_classes - - if num_classes > 0: - classifier = Classifier(num_classes, classifier_activation) - self.classifier = self.add_sublayer("classifier", - Sequential(classifier)) + self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(), + nn.Dropout(), + nn.Linear(4096, num_classes), ) def forward(self, x): x = self.features(x) - - if self.num_classes > 0: - x = fluid.layers.flatten(x, 1) - x = self.classifier(x) + x = self.avgpool(x) + x = paddle.flatten(x, 1) + x = self.classifier(x) return x def make_layers(cfg, batch_norm=False): layers = [] in_channels = 3 - for v in cfg: if v == 'M': - layers += [Pool2D(pool_size=2, pool_stride=2)] + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) if batch_norm: - conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) - layers += [conv2d, BatchNorm(v), ReLU()] + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()] else: - conv2d = Conv2d(in_channels, v, kernel_size=3, padding=1) - layers += [conv2d, ReLU()] + layers += [conv2d, nn.ReLU()] in_channels = v - return Sequential(*layers) + return nn.Sequential(*layers) cfgs = { @@ -144,7 +119,7 @@ def _vgg(arch, cfg, batch_norm, pretrained, **kwargs): model_urls[arch][1]) assert weight_path.endswith( '.pdparams'), "suffix of weight must be .pdparams" - param, _ = fluid.load_dygraph(weight_path) + param, _ = paddle.load(weight_path) model.load_dict(param) return model diff --git a/tools/is_ut_disabled.py b/tools/is_ut_disabled.py deleted file mode 100644 index a21fe39e71e516cf14d1eda970d2deba986ef8a3..0000000000000000000000000000000000000000 --- a/tools/is_ut_disabled.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Check whether ut is disabled. """ - -import os -import sys - - -def check_ut(): - """ Get disabled unit tests. 
""" - disable_ut_file = 'disable_ut' - cmd = 'wget -q --no-check-certificate https://sys-p0.bj.bcebos.com/prec/{}'.format( - disable_ut_file) - os.system(cmd) - with open(disable_ut_file) as utfile: - for u in utfile: - if u.rstrip('\r\n') == sys.argv[1]: - exit(0) - exit(1) - - -if __name__ == '__main__': - if len(sys.argv) != 2: - exit(1) - try: - check_ut() - except Exception as e: - print(e) - exit(1) diff --git a/tools/wlist.json b/tools/wlist.json index 0ed0b4e40698ce26fbddb7e5a421143749b3a3ef..3ca14cd1dd6f964f87031e31128bfb2cb1c733a1 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -251,9 +251,10 @@ "BilinearTensorProduct", "GroupNorm", "SpectralNorm", - "TreeConv", + "TreeConv" + ], + "wlist_temp":[ "prroi_pool", - "to_tensor", "ChunkEvaluator", "EditDistance", "ErrorClipByValue", @@ -406,7 +407,9 @@ "TransformerDecoder.prepare_incremental_cache", "LinearChainCRF.forward", "CRFDecoding.forward", - "SequenceTagging.forward" + "SequenceTagging.forward", + "XPUPlace", + "is_compiled_with_xpu" ], "gpu_not_white":[ "deformable_conv",