diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index 1cf4eb6c2989346c9e9acef648aa74615c7bcb10..d42bd0b16d7a84987517326af9567809fd29da4d 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -45,14 +45,35 @@ inline void InitVarsInScope(const std::vector &var_infos, Scope *scope,
 // get CommContext and remote send and recv op
 void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
 #ifdef PADDLE_WITH_DISTRIBUTE
-  // init communicator here
-  auto *instance = operators::distributed::Communicator::GetInstance();
-  auto initialized = instance ? true : false;
-  PADDLE_ENFORCE_EQ(initialized, true,
-                    platform::errors::InvalidArgument(
-                        "Communicator is not Initialized, you may use "
-                        "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
-                        "develop/markdown_doc/transpiler)"));
+
+  bool need_communicator = false;
+
+  for (auto &node : graphs[0]->Nodes()) {
+    VLOG(3) << "node name " << node->Name();
+    if (node && node->IsOp()) {
+      if (node->Name() == "send") {
+        auto send_varnames =
+            BOOST_GET_CONST(std::vector<std::string>,
+                            node->Op()->GetNullableAttr("send_varnames"));
+
+        if (send_varnames.size() > 0) {
+          need_communicator = true;
+          break;
+        }
+      }
+    }
+  }
+
+  if (need_communicator) {
+    // init communicator here
+    auto *instance = operators::distributed::Communicator::GetInstance();
+    auto initialized = instance ? true : false;
+    PADDLE_ENFORCE_EQ(initialized, true,
+                      platform::errors::InvalidArgument(
+                          "Communicator is not Initialized, you may use "
+                          "FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
+                          "develop/markdown_doc/transpiler)"));
+  }
 #endif
 }
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 55ac44c5e32cef47f5204182a84b05ba0cd1ef1d..0d62488bfe67a316f4840107508129c49b36f23c 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -19,6 +19,6 @@ else()
   cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)
 
-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
 
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc
index 49181cd05f3fac259489fcfc67fd99c559fafeef..bb958f1ac015bfd1a71b3ccd530406a33e4e37cb 100644
--- a/paddle/fluid/framework/fleet/gloo_wrapper.cc
+++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc
@@ -54,10 +54,9 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
       paddle::framework::fs_remove(tmp);
       if (i == retry_times_) {
         VLOG(0) << "fs_open_write failed, retry times reaches limit";
-        // PADDLE_THROW(platform::errors::PreconditionNotMet(
-        //     "fs_open_write failed, retry times reaches"
-        //     " limit ",
-        //     retry_times_));
+        PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
+            "fs_open_write failed, retry times reaches %d limit.",
+            retry_times_));
       }
     } else {
       break;
@@ -143,9 +142,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
           break;
         }
       }
-      // PADDLE_THROW(platform::errors::ExecutionTimeout(
-      VLOG(0) << "TIMEOUT self_rank = " << self_rank_
-              << " pair_rank = " << last_check_rank;
+      PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
+          "TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
+          last_check_rank));
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
   }
diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc
index eee100bc81c337942fe9e051f63155e8b07c1cb8..e7e964b41818597404d9a6e510c0743f4f8c3f7e 100644
--- a/paddle/fluid/framework/unused_var_check.cc
+++ b/paddle/fluid/framework/unused_var_check.cc
@@ -28,38 +28,6 @@ DEFINE_bool(enable_unused_var_check, false,
             "Checking whether operator contains unused inputs, "
             "especially for grad operator. It should be in unittest.");
 
-// NOTE(zhiqiu): Currently, there are some operators which involves unused
-// inputs and cannot be removed from the allow_list below.
-// They can be mainly divided into four categories:
-// 0: the inputs of which are only used in if branch, or used in cuda kernel but
-// not in cpu kernel;
-// 1: the inputs of which are used to indicate dtype of outputs;
-// 2: the inputs of which are used in fused operators.
-// The category number is presented in the comments after each operator.
-
-const std::unordered_set<std::string> op_with_unsed_vars_allow_list = {
-    "batch_norm",                      // 0
-    "batch_norm_grad",                 // 0
-    "sync_batch_norm",                 // 0
-    "sync_batch_norm_grad",            // 0
-    "inplace_abn",                     // 0
-    "inplace_abn_grad",                // 0
-    "dgc_momentum",                    // 0
-    "fake_quantize_range_abs_max",     // 0
-    "rmsprop",                         // 0
-    "sequence_conv_grad",              // 0
-    "roi_perspective_transform_grad",  // 0
-    "fill_zeros_like",                 // 1
-    "fill_any_like",                   // 1
-    "nce_grad",                        // 1
-    "precision_recall",                // 1
-    "fusion_seqpool_cvm_concat",       // 2
-    "fused_batch_norm_act",            // 2
-    "fused_batch_norm_act_grad",       // 2
-    "data_norm",                       // 0
-    "data_norm_grad",                  // 0
-};
-
 namespace paddle {
 namespace framework {
 
@@ -75,9 +43,44 @@ void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
   }
 }
 
+static const std::unordered_set<std::string> &GetOpWithUnusedVarAllowSet() {
+  // NOTE(zhiqiu): Currently, there are some operators which involves unused
+  // inputs and cannot be removed from the allow_list below.
+  // They can be mainly divided into four categories:
+  // 0: the inputs of which are only used in if branch, or used in cuda kernel
+  // but not in cpu kernel; 1: the inputs of which are used to indicate dtype of
+  // outputs; 2: the inputs of which are used in fused operators. The category
+  // number is presented in the comments after each operator.
+  // Use pointer here for safe static deinitialization
+  static auto *allow_set = new std::unordered_set<std::string>({
+      // called once
+      "batch_norm",                      // 0
+      "batch_norm_grad",                 // 0
+      "sync_batch_norm",                 // 0
+      "sync_batch_norm_grad",            // 0
+      "inplace_abn",                     // 0
+      "inplace_abn_grad",                // 0
+      "dgc_momentum",                    // 0
+      "fake_quantize_range_abs_max",     // 0
+      "rmsprop",                         // 0
+      "sequence_conv_grad",              // 0
+      "roi_perspective_transform_grad",  // 0
+      "fill_zeros_like",                 // 1
+      "fill_any_like",                   // 1
+      "nce_grad",                        // 1
+      "precision_recall",                // 1
+      "fusion_seqpool_cvm_concat",       // 2
+      "fused_batch_norm_act",            // 2
+      "fused_batch_norm_act_grad",       // 2
+      "data_norm",                       // 0
+      "data_norm_grad",                  // 0
+  });
+  return *allow_set;
+}
+
 void CheckUnusedVar(const OperatorBase &op, const Scope &scope) {
   // skip op in allow list.
-  if (op_with_unsed_vars_allow_list.count(op.Type()) != 0) {
+  if (GetOpWithUnusedVarAllowSet().count(op.Type()) != 0) {
     return;
   }
   auto *used_set = GetThreadLocalUsedVarNameSet();
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 55a57caf9a0d6eb44399ceb8064b613afb955d47..971f99e69197226bb7d7b26135f0b667f8ebdf30 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -56,9 +56,11 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
   return static_cast<nvinfer1::IRuntime*>(
       dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
 }
-static nvinfer1::IPluginRegistry* getPluginRegistry() {
+#if IS_TRT_VERSION_GE(6000)
+static nvinfer1::IPluginRegistry* GetPluginRegistry() {
   return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
 }
+#endif
 
 // A logger for create TensorRT infer builder.
 class NaiveLogger : public nvinfer1::ILogger {
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index f4424b8b7851fbf41611d4048a4981982179200f..528adacb27c9897420a5115a93c88c246c0d78d8 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -178,12 +178,16 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
   std::string name_space_;
   std::string plugin_base_;
 };
-#endif
 
 template <typename T>
 class TrtPluginRegistrarV2 {
  public:
-  TrtPluginRegistrarV2() { getPluginRegistry()->registerCreator(creator, ""); }
+  TrtPluginRegistrarV2() {
+    static auto func_ptr = GetPluginRegistry();
+    if (func_ptr != nullptr) {
+      func_ptr->registerCreator(creator, "");
+    }
+  }
 
  private:
   T creator;
@@ -193,6 +197,8 @@ class TrtPluginRegistrarV2 {
   static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
       plugin_registrar_##name {}
 
+#endif
+
 }  // namespace plugin
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index 859776bc2a0f0056224b69f74a7e423ff2dd0a01..93d8f42ce2175bbf554eef9892f4da4b9da524ec 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -304,6 +304,7 @@ REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
 REGISTER_OP_CPU_KERNEL(
     squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
    ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -311,12 +312,14 @@ REGISTER_OP_CPU_KERNEL(
     squeeze_grad,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
 REGISTER_OP_CPU_KERNEL(
     squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
@@ -324,6 +327,7 @@ REGISTER_OP_CPU_KERNEL(
     squeeze2_grad,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
     ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc
index 61a3a39de4a3f16135ea39afbba458dbfc9aa734..f469118fae7099999389076733c9143d56c0e770 100644
--- a/paddle/fluid/operators/squeeze_op.cu.cc
+++ b/paddle/fluid/operators/squeeze_op.cu.cc
@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
     squeeze, ops::SqueezeKernel<plat::CUDADeviceContext, float>,
     ops::SqueezeKernel<plat::CUDADeviceContext, double>,
     ops::SqueezeKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::SqueezeKernel<plat::CUDADeviceContext, bool>,
     ops::SqueezeKernel<plat::CUDADeviceContext, int>,
     ops::SqueezeKernel<plat::CUDADeviceContext, int8_t>,
     ops::SqueezeKernel<plat::CUDADeviceContext, int64_t>);
@@ -29,6 +30,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::SqueezeGradKernel<plat::CUDADeviceContext, float>,
     ops::SqueezeGradKernel<plat::CUDADeviceContext, double>,
     ops::SqueezeGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::SqueezeGradKernel<plat::CUDADeviceContext, bool>,
     ops::SqueezeGradKernel<plat::CUDADeviceContext, int>,
     ops::SqueezeGradKernel<plat::CUDADeviceContext, int8_t>,
     ops::SqueezeGradKernel<plat::CUDADeviceContext, int64_t>);
@@ -36,6 +38,7 @@ REGISTER_OP_CUDA_KERNEL(
     squeeze2, ops::Squeeze2Kernel<plat::CUDADeviceContext, float>,
     ops::Squeeze2Kernel<plat::CUDADeviceContext, double>,
     ops::Squeeze2Kernel<plat::CUDADeviceContext, plat::float16>,
+    ops::Squeeze2Kernel<plat::CUDADeviceContext, bool>,
     ops::Squeeze2Kernel<plat::CUDADeviceContext, int>,
     ops::Squeeze2Kernel<plat::CUDADeviceContext, int8_t>,
     ops::Squeeze2Kernel<plat::CUDADeviceContext, int64_t>);
@@ -44,6 +47,7 @@ REGISTER_OP_CUDA_KERNEL(
     ops::Squeeze2GradKernel<plat::CUDADeviceContext, float>,
     ops::Squeeze2GradKernel<plat::CUDADeviceContext, double>,
     ops::Squeeze2GradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::Squeeze2GradKernel<plat::CUDADeviceContext, bool>,
     ops::Squeeze2GradKernel<plat::CUDADeviceContext, int>,
     ops::Squeeze2GradKernel<plat::CUDADeviceContext, int8_t>,
     ops::Squeeze2GradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
index 60e299385d6a6433d11753c7a0b96958b48a8e2a..67a79ce4bb1594afd23d960d18b75a8f0f1b2513 100644
--- a/paddle/fluid/platform/dynload/tensorrt.h
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -36,26 +36,29 @@ extern void* tensorrt_dso_handle;
   struct DynLoad__##__name {                                                  \
     template <typename... Args>                                               \
     auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
-      using tensorrt_func = decltype(&::__name);                              \
       std::call_once(tensorrt_dso_flag, []() {                                \
         tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
-        PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle,                          \
-                                platform::errors::Unavailable(                \
-                                    "Load tensorrt %s failed", #__name));     \
       });                                                                     \
       static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);          \
-      PADDLE_ENFORCE_NOT_NULL(                                                \
-          p_##__name,                                                         \
-          platform::errors::Unavailable("Load tensorrt %s failed", #__name)); \
+      if (p_##__name == nullptr) {                                            \
+        return nullptr;                                                       \
+      }                                                                       \
+      using tensorrt_func = decltype(&::__name);                              \
       return reinterpret_cast<tensorrt_func>(p_##__name)(args...);            \
     }                                                                         \
   };                                                                          \
   extern DynLoad__##__name __name
 
+#if (NV_TENSORRT_MAJOR >= 6)
 #define TENSORRT_RAND_ROUTINE_EACH(__macro) \
   __macro(createInferBuilder_INTERNAL);     \
   __macro(createInferRuntime_INTERNAL);     \
   __macro(getPluginRegistry);
+#else
+#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
+  __macro(createInferBuilder_INTERNAL);     \
+  __macro(createInferRuntime_INTERNAL);
+#endif
 
 TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 9a3a639579bd9d44f257c3f0f1aa63e0ae27e8e2..5b612677da3554f17ab3ac29ddc241eee5f7c768 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -33,6 +33,7 @@ limitations under the License. */
 #include
 #include
 #include
+
 #include "paddle/fluid/platform/cuda_error.pb.h"
 #endif  // PADDLE_WITH_CUDA
 
@@ -69,6 +70,8 @@ limitations under the License.
 */
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/type_defs.h"
+DECLARE_int32(call_stack_level);
+
 namespace paddle {
 namespace platform {
 
@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
   return str;
 }
 
-template <typename StrType>
-inline std::string GetTraceBackString(StrType&& what, const char* file,
-                                      int line) {
+inline std::string GetCurrentTraceBackString() {
   static constexpr int TRACE_STACK_LIMIT = 100;
   std::ostringstream sout;
 
@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
 #else
   sout << "Windows not support stack backtrace yet.\n";
 #endif
+  return sout.str();
+}
+
+template <typename StrType>
+inline std::string GetErrorSummaryString(StrType&& what, const char* file,
+                                         int line) {
+  std::ostringstream sout;
   sout << "\n----------------------\nError Message "
           "Summary:\n----------------------\n";
   sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
   return sout.str();
 }
 
+template <typename StrType>
+inline std::string GetTraceBackString(StrType&& what, const char* file,
+                                      int line) {
+  if (FLAGS_call_stack_level > 1) {
+    // FLAGS_call_stack_level>1 means showing c++ call stack
+    return GetCurrentTraceBackString() + GetErrorSummaryString(what, file, line);
+  } else {
+    return GetErrorSummaryString(what, file, line);
+  }
+}
+
 inline bool is_error(bool stat) { return !stat; }
 
 inline void throw_on_error(bool stat, const std::string& msg) {
@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
  *
  * Examples:
  *    GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul");
-*/
+ */
 #define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE)           \
   (([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type {  \
     auto* __ptr = (__PTR);                                          \
@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
  *
  * Examples:
  *    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
-*/
+ */
 #define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE)               \
   do {                                                                  \
     PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \
@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
  * Note: GCC 4.8 cannot select right overloaded function here, so need
  * to define different functions and macros here, after we upgrade
  * CI gcc version, we can only define one BOOST_GET macro.
-*/
+ */
 namespace details {
 
 #define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 98bdf1f8c675da4e3a272945d605563e35016f8d..8667375c6f2726f1099c6e57c6e793252b01d454 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
  * Note:
  */
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
+
+/**
+ * Debug related FLAG
+ * Name: FLAGS_call_stack_level
+ * Since Version: 2.0.0
+ * Value Range: int, default=2
+ * Example:
+ * Note: Used to debug. Determine the call stack to print when error or
+ * exception happens.
+ * If FLAGS_call_stack_level == 0, only the error message summary will be shown.
+ * If FLAGS_call_stack_level == 1, the python stack and error message summary
+ * will be shown.
+ * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
+ * message summary will be shown.
+ */
+DEFINE_int32(
+    call_stack_level, 2,
+    "Determine the call stack to print when error or exception happens."
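+    // NOTE: besides being read in enforce.h above, this flag is registered
+    // with the Python-side global flag getter/setter (see
+    // global_value_getter_setter.cc below), and, like other FLAGS_*, it is
+    // usually set through an environment variable of the same name.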
+ // TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0 + // "If FLAGS_call_stack_level == 0, only the error message summary will be " + // "shown. " + "If FLAGS_call_stack_level == 1, the python stack and error message " + "summary will be shown." + "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and " + "error message summary will be shown."); diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 5178b5f89adf3b8a39b303228d1e674b22e7dc2d..deca9625e63d05625c407a1282b396398bb78ccc 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/pybind/global_value_getter_setter.h" + #include #include #include @@ -20,6 +21,7 @@ #include #include #include + #include "gflags/gflags.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/platform/enforce.h" @@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic); DECLARE_bool(enable_rpc_profiler); DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_bool(reader_queue_speed_test_mode); +DECLARE_int32(call_stack_level); // device management DECLARE_int32(paddle_num_threads); // executor @@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() { REGISTER_PUBLIC_GLOBAL_VAR( FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph, FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf, - FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler, - FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode, - FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir, - FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size, - FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem, - FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion, - FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism, - FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads); + FLAGS_call_stack_level, FLAGS_cpu_deterministic, + FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size, + FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname, + FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use, + FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size, + FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb, + FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory, + FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname, + FLAGS_paddle_num_threads); #ifdef PADDLE_WITH_CUDA REGISTER_PUBLIC_GLOBAL_VAR( diff --git a/python/paddle/fleet/base/role_maker.py b/python/paddle/fleet/base/role_maker.py index f6b5c8ac12e92dcbe6ca710f20d509cabaafac63..b3e8120af6f855bb6dba157af107f4ca7ca3b3a4 100644 --- a/python/paddle/fleet/base/role_maker.py +++ b/python/paddle/fleet/base/role_maker.py @@ -12,5 +12,523 @@ # See the License for the specific language governing permissions and # limitations under the License. """Defination of Role Makers.""" +import os +import numpy as np +from multiprocessing import Process, Manager +import paddle.fluid as fluid -# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] +__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker'] + + +class Role: + WORKER = 1 + SERVER = 2 + + +class RoleMakerBase(object): + """ + RoleMakerBase is a base class for assigning a role to current process + in distributed training. 
+ A paddle developer can implement RoleMakerBase to design a role maker + for worker or pserver assignment. + """ + + def __init__(self): + self._worker_endpoints = [] + self._server_endpoints = [] + self._role_is_generated = False + self._role = None + self._current_id = -1 + + self._node_type = None + self._node_type_comm = None + self._all_comm = None + + def is_worker(self): + """ + return is_worker() of current process + """ + raise NotImplementedError("Please implement this method in child class") + + def is_server(self): + """ + return is_server() of current process + """ + raise NotImplementedError("Please implement this method in child class") + + def is_first_worker(self): + """ + Check whether the node is the first instance of worker. + Returns: + bool: True if this is the first node of worker, + False if not. + """ + raise NotImplementedError("Please implement this method in child class") + + def worker_num(self): + """ + Get current total worker number. + + Returns: + int: worker number + """ + raise NotImplementedError("Please implement this method in child class") + + def server_num(self): + """ + Get current total server number. + + Returns: + int: server number + """ + raise NotImplementedError("Please implement this method in child class") + + def worker_index(self): + """ + Get current worker id. + + Returns: + int: node id + """ + raise NotImplementedError("Please implement this method in child class") + + def server_index(self): + """ + Get current server id. + + Returns: + int: node id + """ + raise NotImplementedError("Please implement this method in child class") + + def role_id(self): + """ + Get current id. + + Returns: + int: node id + """ + raise NotImplementedError("Please implement this method in child class") + + def get_trainer_endpoints(self): + """ + return trainer endpoints + """ + return self._worker_endpoints + + def get_pserver_endpoints(self): + """ + return pserver endpoints + """ + return self._server_endpoints + + def to_string(self): + return "role: {}, current_id: {}, worker_endpoints: {}, server_endpoints: {}".format( + self._role, self._current_id, self._worker_endpoints, + self._server_endpoints) + + def _all_gather(self, comm_world, input): + """ + + Args: + input(int|float): input value + + Returns: + return a list of values + """ + print("warning: RoleMakerBase does not have all gather.") + return None + + def _all_reduce(self, comm_world, input, mode="sum"): + """ + Args: + input(list/numpy.array): array of one dim + output(list/numpy.array): array of one dim + mode(str): "sum" or "min" or "max" + """ + print("warning: RoleMakerBase does not have all reduce worker.") + return None + + def _barrier(self, comm_world): + """ + barrier between trainers if current role is TRAINER + """ + print("warning: RoleMakerBase does not have barrier worker.") + + +class PaddleCloudRoleMaker(RoleMakerBase): + def __init__(self, is_collective=False, init_gloo=True, **kwargs): + super(PaddleCloudRoleMaker, self).__init__() + self._is_collective = is_collective + self._init_gloo = init_gloo + self._kwargs = kwargs + + self._role_is_generated = False + + self._server_endpoints = None + self._worker_endpoints = None + + self._node_type_comm = None + self._all_comm = None + + if not self._is_collective: + self._hdfs_name = kwargs.get("hdfs_name", "") + self._hdfs_ugi = kwargs.get("hdfs_ugi", "") + self._hdfs_path = kwargs.get("path", "").rstrip("/") + self._init_timeout_seconds = kwargs.get("init_timeout_seconds", + 3600) + self._run_timeout_seconds = 
kwargs.get("run_timeout_seconds", + 9999999) + ip_port = kwargs.get("http_ip_port", "") + self._http_ip_port = [] + self._http_server = None + # if ip_port is not empty, it will use http instead of hdfs + if ip_port != "": + self._http_ip_port = ip_port.split(":") + # it's for communication between processes + self._manager = Manager() + # global dict to store status + self._http_server_d = self._manager.dict() + # set running status of http server + self._http_server_d["running"] = False + self._iface = self.__get_default_iface() + # this environment variable can be empty + self._prefix = os.getenv("SYS_JOB_ID", "") + + def _barrier(self, comm_world): + if comm_world: + comm_world.barrier() + + def _all_gather(self, comm_world, input): + if comm_world: + self._barrier(comm_world) + output = comm_world.all_gather(input) + return output + else: + return None + + def _all_reduce(self, comm_world, input, mode="sum"): + if not comm_world: + return None + + input = np.array(input) + + input_shape = input.shape + input_list = input.reshape(-1).tolist() + + self._barrier(comm_world) + ans = comm_world.all_reduce(input_list, mode) + output = np.array(ans).reshape(input_shape) + return output + + def is_worker(self): + """ + whether current process is worker + """ + if not self._role_is_generated: + self.generate_role() + return self._role == Role.WORKER + + def is_server(self): + """ + whether current process is server + """ + if not self._role_is_generated: + self.generate_role() + return self._role == Role.SERVER + + def is_first_worker(self): + """ + whether current process is worker of rank 0 + """ + if not self._role_is_generated: + self.generate_role() + return self._role == Role.WORKER and self._current_id == 0 + + def worker_index(self): + """ + get index of current worker + """ + if not self._role_is_generated: + self.generate_role() + return self._current_id + + def server_index(self): + """ + get index of current server + """ + if not self._role_is_generated: + self.generate_role() + return self._current_id + + def role_id(self): + """ + get index of current node + """ + if self.is_server(): + return self.server_index() + elif self.is_worker(): + return self.worker_index() + + def worker_num(self): + """ + retrun the current number of worker + """ + if not self._role_is_generated: + self.generate_role() + return self._trainers_num + + def server_num(self): + """ + return the current number of server + """ + if not self._role_is_generated: + self.generate_role() + return self._trainers_num + + def get_trainer_endpoints(self): + """ + get endpoint of all trainers + """ + if not self._role_is_generated: + self.generate_role() + return self._worker_endpoints + + def get_pserver_endpoints(self): + """ + get endpoint of all pservers + """ + if not self._role_is_generated: + self.generate_role() + return self._server_endpoints + + def _get_rank(self): + """ + get current rank in all workers and pservers + """ + if not self._role_is_generated: + self.generate_role() + return self._rank + + def _get_size(self): + """ + get total num of all workers and pservers + """ + if not self._role_is_generated: + self.generate_role() + return self._size + + def _ps_env(self): + try: + # Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set + # format: string(ip:port), eg. 
127.0.0.1:6001 + self._server_endpoints = os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"].split(",") + self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", + "").split(",") + + trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"]) + training_role = os.environ["TRAINING_ROLE"] + + if training_role not in ["TRAINER", "PSERVER"]: + raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER") + + if training_role == "TRAINER": + role = Role.WORKER + current_id = int(os.environ["PADDLE_TRAINER_ID"]) + if len(self._worker_endpoints) > 0: + self._cur_endpoint = self._worker_endpoints[current_id] + elif training_role == "PSERVER": + role = Role.SERVER + port = os.environ["PADDLE_PORT"] + ip = os.environ["POD_IP"] + self._cur_endpoint = ip + ":" + port + current_id = self._server_endpoints.index(self._cur_endpoint) + else: + raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER") + except ValueError as ve: + raise ValueError( + "something wrong with PaddleCloud, please check environment") + + self._trainers_num = trainers_num + self._role = role + self._current_id = current_id + + def _collective_env(self): + self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) + self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER") + assert (self._training_role == "TRAINER") + self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS") + self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS" + self._worker_endpoints = self._worker_endpoints.split(",") + self._trainers_num = len(self._worker_endpoints) + + def _init_gloo_env(self): + def init_gloo_instance(role="trainer"): + role = role.lower() + assert role in ["trainer", "pserver", "all"] + if role == "trainer": + all_list = self._worker_endpoints + rank = self._current_id + elif role == "pserver": + all_list = self._server_endpoints + rank = self._current_id + else: + all_list = self._worker_endpoints + self._server_endpoints + rank = all_list.index(self._cur_endpoint) + gloo = fluid.core.Gloo() + gloo.set_rank(rank) + gloo.set_size(len(all_list)) + gloo.set_prefix(self._prefix) + gloo.set_iface(self._iface) + gloo.set_timeout_seconds(self._init_timeout_seconds, + self._run_timeout_seconds) + if len(self._http_ip_port) != 0: + gloo.set_http_store(self._http_ip_port[0], + int(self._http_ip_port[1]), role) + else: + gloo.set_hdfs_store(self._hdfs_path + "/" + role, + self._hdfs_name, self._hdfs_ugi) + gloo.init() + return gloo + + # paddlecloud support gloo + if self._role == Role.WORKER: + if self._current_id == 0 and len(self._http_ip_port) != 0: + size_d = { + "trainer": len(self._worker_endpoints), + "pserver": len(self._server_endpoints), + "all": + len(self._worker_endpoints) + len(self._server_endpoints) + } + # child process for http server + self._http_server = Process( + target=self.__start_kv_server, + args=(self._http_server_d, size_d)) + self._http_server.daemon = True + # set running status to True + self._http_server_d["running"] = True + # start child process + self._http_server.start() + self._node_type = 1 + gloo = init_gloo_instance("trainer") + self._node_type_comm = gloo + else: + assert self._role == Role.SERVER + self._node_type = 0 + gloo = init_gloo_instance("pserver") + self._node_type_comm = gloo + + all_list = self._worker_endpoints + self._server_endpoints + self._rank = all_list.index(self._cur_endpoint) + self._size = len(all_list) + + gloo = init_gloo_instance("all") + self._all_comm = gloo + + if 
self._http_server is not None: + # set running status to False + self._http_server_d["running"] = False + # wait until child process exits + self._http_server.join() + + def generate_role(self): + """ + generate role for role maker + """ + if not self._role_is_generated: + if not self._is_collective: + self._ps_env() + if self._init_gloo: + self._init_gloo_env() + else: + self._collective_env() + self._role_is_generated = True + + def __get_default_iface(self): + """ + get default physical interface + """ + default1 = self.__get_default_iface_from_gateway() + default2 = self.__get_default_iface_from_interfaces() + return default2 if default1 == "lo" else default1 + + def __get_default_iface_from_gateway(self): + """ + get default physical interface + """ + import netifaces + gateways = netifaces.gateways() + if gateways.get(netifaces.AF_INET) != None: + gateway = gateways[netifaces.AF_INET] + if len(gateway) > 0 and len(gateway[0]) > 1: + return gateway[0][1] + return "lo" + + def __get_default_iface_from_interfaces(self): + """ + get default physical interface + """ + import netifaces + for intf_name in netifaces.interfaces(): + addresses = netifaces.ifaddresses(intf_name) + if netifaces.AF_INET in addresses: + ipv4_addresses = addresses[netifaces.AF_INET] + for ipv4_address in ipv4_addresses: + if 'broadcast' in ipv4_address: + return intf_name + return "lo" + + def __start_kv_server(self, http_server_d, size_d): + from paddle.fleet.utils import KVServer + http_server = KVServer(int(self._http_ip_port[1]), size_d) + http_server.start() + wait_seconds = 5 + while http_server_d.get("running", + False) and not http_server.shoud_stop(): + time.sleep(wait_seconds) + http_server.stop() + + +class UserDefinedRoleMaker(PaddleCloudRoleMaker): + def __init__(self, is_collective=False, init_gloo=False, **kwargs): + super(UserDefinedRoleMaker, self).__init__( + is_collective=is_collective, init_gloo=init_gloo, **kwargs) + + def _user_defined_ps_env(self): + self._server_endpoints = self._kwargs.get("server_endpoints") + self._worker_endpoints = self._kwargs.get("worker_endpoints", []) + self._trainers_num = self._kwargs.get("worker_num", 0) + + if self._trainers_num == 0: + assert (len(self._worker_endpoints) > 0) + self._trainers_num = len(self._worker_endpoints) + + self._role = self._kwargs.get("role") + self._current_id = self._kwargs.get("current_id") + + if self._role == Role.WORKER and len( + self._worker_endpoints) > self._current_id: + self._cur_endpoint = self._worker_endpoints[self._current_id] + elif self._role == Role.SERVER: + self._cur_endpoint = self._server_endpoints[self._current_id] + + def _user_defined_collective_env(self): + self._worker_endpoints = self._kwargs.get("worker_endpoints") + self._current_id = self._kwargs.get("current_id") + self._trainers_num = len(self._worker_endpoints) + self._training_role = Role.Worker + + def generate_role(self): + """ + generate role for role maker + """ + if not self._role_is_generated: + if not self._is_collective: + self._user_defined_ps_env() + if self._init_gloo: + self._init_gloo_env() + else: + self._user_defined_collective_env() + self._role_is_generated = True diff --git a/python/paddle/fleet/base/util_factory.py b/python/paddle/fleet/base/util_factory.py index 385500de8c018853fe46205fc3d5bc6aac1aa22d..ed2a8db586aa9c33a3aeed51b77af98e11b4dc5f 100644 --- a/python/paddle/fleet/base/util_factory.py +++ b/python/paddle/fleet/base/util_factory.py @@ -18,12 +18,27 @@ __all__ = ['UtilBase'] +import numpy as np +import os + +import 
subprocess +from paddle.fluid import core +from collections import OrderedDict +import paddle.fluid as fluid +from google.protobuf import text_format +from paddle.fluid import debugger +from paddle.fluid.framework import Program +from paddle.fluid.proto import framework_pb2 +from ..utils.fs import FS, LocalFS, HDFSClient + class UtilFactory(object): - def _create_util(self, context): + def _create_util(self, context=None): util = UtilBase() - util._set_strategy(context["valid_strategy"]) - util._set_role_maker(context["role_maker"]) + if context is not None and "valid_strategy" in context: + util._set_strategy(context["valid_strategy"]) + if context is not None and "role_maker" in context: + util._set_role_maker(context["role_maker"]) return util @@ -38,43 +53,390 @@ class UtilBase(object): def _set_role_maker(self, role_maker): self.role_maker = role_maker - ''' def set_file_system(self, fs_client): + assert isinstance( + fs_client, + FS), "fs_client must be the instance of paddle.fleet.utils.FS" self.fs_client = fs_client - def broadcast(self): - pass + def __check_comm_world(self, comm_world="worker"): + if not self.role_maker._role_is_generated: + self.role_maker.generate_role() - def all_gather(self): - pass + _comm_world = None + comm_world_upper = comm_world.upper() + if comm_world_upper == "WORKER": + if not self.role_maker.is_worker(): + print( + "warning: current role is not worker in collective_func(comm_world=\"worker\")" + ) + _comm_world = self.role_maker._node_type_comm + elif comm_world_upper == "SERVER": + if not self.role_maker.is_server(): + print( + "warning: current role is not server in collective_func(comm_world=\"server\")" + ) + _comm_world = self.role_maker._node_type_comm + elif comm_world_upper == "ALL": + _comm_world = self.role_maker._all_comm + else: + raise ValueError( + "not support comm_world, please choose one from [worker, server, all]" + ) - def all_reduce(self): - pass + return _comm_world - def reduce_scatter(self): + def all_reduce(self, input, mode, comm_world="worker"): + _comm_world = self.__check_comm_world(comm_world) + return self.role_maker._all_reduce(_comm_world, input, mode) + + def barrier(self, comm_world="worker"): + _comm_world = self.__check_comm_world(comm_world) + self.role_maker._barrier(_comm_world) + + def all_gather(self, input, comm_world="worker"): + _comm_world = self.__check_comm_world(comm_world) + return self.role_maker._all_gather(_comm_world, input) + + def broadcast(self): pass - def reduce(self): + def scatter(self): pass def get_file_shard(self, files): - pass + """ + split files before distributed training, + example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer + 0 gets [a, b, c] and trainer 1 gets [d, e]. + example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets + [a], trainer 1 gets [b], trainer 2 gets [] - def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist): - pass + Args: + files(list): file list need to be read. - def save_program(program, output_dir): - pass + Returns: + list: files belongs to this worker. 
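+
+        A minimal sketch of the split arithmetic used below (the file names
+        here are hypothetical, only to illustrate the rule):
+
+            files = ["a", "b", "c", "d", "e"]
+            trainers, trainer_id = 2, 0
+            remainder = len(files) % trainers       # 1
+            blocksize = int(len(files) / trainers)  # 2
+            blocks = [blocksize] * trainers         # [2, 2]
+            for i in range(remainder):
+                blocks[i] += 1                      # blocks == [3, 2]
+            # trainer 0 reads files[0:3]; trainer 1 reads files[3:5]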
+ """ + if not isinstance(files, list): + raise TypeError("files should be a list of file need to be read.") - def load_program(input_dir): - pass + trainer_id = self.role_maker.worker_index() + trainers = self.role_maker.worker_num() - def load_var(): - pass + remainder = len(files) % trainers + blocksize = int(len(files) / trainers) - def save_var(): - pass + blocks = [blocksize] * trainers + for i in range(remainder): + blocks[i] += 1 - def print_on_rank(self): - pass - ''' + trainer_files = [[]] * trainers + begin = 0 + for i in range(trainers): + trainer_files[i] = files[begin:begin + blocks[i]] + begin += blocks[i] + + return trainer_files[trainer_id] + + def print_on_rank(self, message, rank_id): + if self.role_maker.worker_index() != rank_id: + return + print(message) + + def _save_program(self, program, model_filename='__model__', is_text=False): + if is_text: + with open(model_filename, "w") as f: + f.write(str(program)) + else: + with open(model_filename, "wb") as f: + f.write(program.desc.serialize_to_string()) + + def _load_program(self, path, is_text): + def load_program_binary(path): + """load program from binary string file""" + with open(path, "rb") as f: + program_desc_str = f.read() + return Program.parse_from_string(program_desc_str) + + def load_program_text(path): + """load program from human-readable text file""" + with open(path, "r") as f: + program_desc_text = f.read() + + prog_desc = framework_pb2.ProgramDesc() + text_format.Merge(program_desc_text, prog_desc) + return Program.parse_from_string(prog_desc.SerializeToString()) + + if is_text: + return load_program_text(path) + else: + return load_program_binary(path) + + def _program_type_trans(self, prog_dir, prog_fn, is_text): + prog = self._load_program(os.path.join(prog_dir, prog_fn), is_text) + prog_out_fn = prog_fn + ".bin" if is_text else prog_fn + ".pbtxt" + self._save_program(prog, + os.path.join(prog_dir, prog_out_fn), 1 - is_text) + return prog_out_fn + + def _visualize_graphviz(self, program, output_dir, output_filename): + block = program.global_block() + dot_path = os.path.join(output_dir, output_filename + '.dot') + pdf_path = os.path.join(output_dir, output_filename + '.pdf') + debugger.draw_block_graphviz(block, path=dot_path) + cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path] + p = subprocess.Popen( + cmd, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + p.wait() + + def _proto_check(self, config): + train_prog = self._load_program(config.train_prog_path, + config.is_text_train_program) + pruned_prog = self._load_program(config.pruned_prog_path, + config.is_text_pruned_program) + + is_match = True + + pruned_vars = [(v.name, v) for v in pruned_prog.list_vars() + if fluid.io.is_persistable(v)] + pruned_vars = OrderedDict(pruned_vars) + pruned_vars_name = [name for name in pruned_vars] + print("persistable vars in pruned program: {}".format(pruned_vars_name)) + + # feed and fetch op is added in pruned program when pruning, not need to be found in train program + feed_fetch_type_list = [ + core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST + ] + + for var_name in pruned_vars: + var = pruned_vars[var_name] + # feed and fetch op is added in pruned program when pruning, not need to be found in train program + if var.type in feed_fetch_type_list: + break + try: + train_prog_var = train_prog.global_block().var(var_name) + except ValueError as e: + print( + "Not find variable '%s' in train program. please check pruning." 
+ % var_name) + is_match = False + continue + if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype: + print( + "variable: {} not match. in pruned program shape: {} dtype:{}, in train program shape: {} dtype: {}". + format(var_name, var.shape, var.dtype, train_prog_var.shape, + train_prog_var.dtype)) + is_match = False + return is_match + + def _params_check(self, config): + def feed_gen(batch_size, feeded_vars_dims, feeded_vars_filelist): + def reader(batch_size, fn, dim): + data = [] + if isinstance(dim, list) or isinstance(dim, tuple): + shape = list(dim) + _temp = 1 + for x in dim: + _temp = _temp * x + dim = _temp + else: + shape = [dim] + + shape = [batch_size] + shape + dim = dim * batch_size + + for line in open(fn, 'r'): + fields = line.strip().split(' ') + fields = [float(d) for d in fields] + while len(fields) >= dim: + tmp = fields[:dim] + fields = fields[dim:] + data.append(np.array(tmp).reshape(shape)) + return data + + batch_feed = [] + for i, fn in enumerate(feeded_vars_filelist): + batch_feed.append(reader(batch_size, fn, feeded_vars_dims[i])) + return batch_feed + + prog = self._load_program( + os.path.join(config.dump_model_dir, config.dump_program_filename), + config.is_text_dump_program) + if config.is_text_dump_program: + model_filename = self._program_type_trans( + config.dump_model_dir, config.dump_program_filename, + config.is_text_dump_program) + + saved_params = [ + v for v in prog.list_vars() if fluid.io.is_persistable(v) + ] + print("persistable vars in dump program: {}".format( + [v.name for v in saved_params])) + + def check_not_expected_ops(prog, not_expected_op_types): + op_types_set = set() + for op in prog.global_block().ops: + if op.type in not_expected_op_types and op.type not in op_types_set: + op_types_set.add(op.type) + return op_types_set + + not_expected_op_types = check_not_expected_ops(prog, ["lookup_table"]) + if len(not_expected_op_types) > 0: + print( + "find op type '{}' in program, please check if your program is pruned correctly !". + format(list(not_expected_op_types))) + return False + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + inference_program, feed_target_names, fetch_targets = \ + fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename, + params_filename=config.save_params_filename) + + # check program vars and saved vars shape + orig_para_shape = { + each_var.name: tuple(each_var.desc.shape()) + for each_var in saved_params + } + for each_var in saved_params: + var_temp = fluid.global_scope().find_var(each_var.name) + assert var_temp != None, "can't not find var: " + each_var.name + new_shape = (np.array(var_temp.get_tensor())).shape + assert each_var.name in orig_para_shape, each_var.name + "MUST in var list" + orig_shape = orig_para_shape.get(each_var.name) + if new_shape != orig_shape: + raise RuntimeError( + "Shape not matching: the Program requires a parameter with a shape of ({}), " + "while the loaded parameter (namely [ {} ]) has a shape of ({}).". + format(orig_shape, each_var.name, new_shape)) + + # check feed/fetch vars in program and config + feed_config = config.feed_config + fetch_config = config.fetch_config + fetch_targets_names = [v.name for v in fetch_targets] + if not feed_target_names: + print("warning! no feed targets in program.") + if not fetch_targets_names: + print("warning! 
no fetch targets in program.") + fetch_list = fetch_targets + feed_name_list = feed_target_names + if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names: + print( + "warning! feed vars in program and config are diff: feed in program: {}. feed in config {}.". + format(feed_target_names, feed_config.feeded_vars_names)) + feed_name_list = feed_config.feeded_vars_names + # remove feed op in inference_program. new feed op will be added in exe.run + global_block = inference_program.global_block() + need_to_remove_op_index = [] + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "feed": # only remove feed op here + need_to_remove_op_index.append(i) + for index in need_to_remove_op_index[::-1]: + global_block._remove_op(index) + if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names: + print( + "warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.". + format(fetch_targets_names, fetch_config.fetch_vars_names)) + fetch_list = [ + inference_program.global_block().var(i) + for i in fetch_config.fetch_vars_names + ] + # remove fetch op in inference_program. new fetch op will be added in exe.run + global_block = inference_program.global_block() + need_to_remove_op_index = [] + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "fetch": # only remove fetch op here + need_to_remove_op_index.append(i) + for index in need_to_remove_op_index[::-1]: + global_block._remove_op(index) + + # if fetch_list have lod tensor + return_numpy = all([v.lod_level == 0 for v in fetch_list]) + + # try dump fetch_targets + feed_tensors = [] + assert len(feed_config.feeded_vars_names) == len( + feed_config.feeded_vars_dims) == len( + feed_config.feeded_vars_types) + # check program vars and feed tensor shape in config + for i in range(len(feed_config.feeded_vars_names)): + var = inference_program.global_block().var( + feed_config.feeded_vars_names[i]) + if not isinstance(feed_config.feeded_vars_dims[i], + (list, tuple)): + tensor_shape = (feed_config.feeded_vars_dims[i], ) + else: + tensor_shape = tuple(feed_config.feeded_vars_dims[i]) + feed_config.feeded_vars_dims[i] = tensor_shape + var_shape = var.shape[1:] + if tensor_shape != var_shape: + raise RuntimeError( + "feed variable '{}' shape not match. infer program shape: {}. feed tensor shape: {}". + format(feed_config.feeded_vars_names[i], var_shape, + tensor_shape)) + + if not feed_config.feeded_vars_filelist: + print("generate random feed vars.") + for i in range(len(feed_config.feeded_vars_names)): + var = inference_program.global_block().var( + feed_config.feeded_vars_names[i]) + # create fake feed tensor. if lod_level > 1, should create_lod_tensor() + if var.lod_level == 0: + feed_tensors.append( + np.array( + np.random.random( + tuple([config.batch_size] + list( + feed_config.feeded_vars_dims[i]))), + dtype=feed_config.feeded_vars_types[i])) + elif var.lod_level == 1: + t = np.array( + np.random.random( + tuple([config.batch_size] + list( + feed_config.feeded_vars_dims[i]))), + dtype=feed_config.feeded_vars_types[i]) + feed_tensors.append( + fluid.create_lod_tensor(t, [[1] * config.batch_size + ], place)) + else: + raise RuntimeError( + "vars with lod_level >= 2 is not supported now in this infer program check tool." 
+ ) + results = exe.run(inference_program, + feed={ + name: feed_tensors[i] + for i, name in enumerate(feed_name_list) + }, + fetch_list=fetch_list, + return_numpy=return_numpy) + else: + print("load feed vars from files: {}.".format( + feed_config.feeded_vars_filelist)) + feed_vars = [ + inference_program.global_block().var( + feed_config.feeded_vars_names[i]) + for i in range(len(feed_config.feeded_vars_names)) + ] + feeder = fluid.DataFeeder(feed_list=feed_vars, place=place) + batch_feed = feed_gen(config.batch_size, + feed_config.feeded_vars_dims, + feed_config.feeded_vars_filelist) + slots = [batch_feed] + results = exe.run(inference_program, + feed=feeder.feed(slots), + fetch_list=fetch_list, + return_numpy=return_numpy) + for i, v in enumerate(fetch_list): + print("fetch_targets name: %s" % v.name) + print("fetch_targets: {}".format(results[i])) + return results + + +fleet_util = UtilFactory()._create_util(None) diff --git a/python/paddle/fleet/utils/__init__.py b/python/paddle/fleet/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..212308159aabb123fde11543b3482f2232b4925d --- /dev/null +++ b/python/paddle/fleet/utils/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .fs import * +from .http_server import KVHandler, KVHTTPServer, KVServer + +__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__ diff --git a/python/paddle/fleet/utils/fs.py b/python/paddle/fleet/utils/fs.py new file mode 100644 index 0000000000000000000000000000000000000000..3fec773f2731803cd9166ae0500dba68f4f0011b --- /dev/null +++ b/python/paddle/fleet/utils/fs.py @@ -0,0 +1,382 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
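+"""File system utilities for fleet: a common FS interface with a local
+backend (LocalFS) and an HDFS backend (HDFSClient), both defined below.
+
+A minimal usage sketch for the local backend (the directory name is made
+up for illustration):
+
+    client = LocalFS()
+    client.mkdirs("./test_local_dir")        # create the dir if missing
+    dirs, files = client.ls_dir("./test_local_dir")
+    client.delete("./test_local_dir")
+"""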
+ +import os +import sys +import subprocess +import multiprocessing +from datetime import datetime + +import re +import copy +import errno +import time +import logging +import six +import abc +import paddle.fluid as fluid +import functools + +from pathlib import PurePosixPath, Path +import shutil + +__all__ = [ + 'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut', + 'FSFileExistsError', 'FSFileNotExistsError' +] + + +class ExecuteError(Exception): + pass + + +class FSFileExistsError(Exception): + pass + + +class FSFileNotExistsError(Exception): + pass + + +class FSTimeOut(Exception): + pass + + +class FS(object): + @abc.abstractmethod + def ls_dir(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def is_file(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def is_dir(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def is_exist(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def upload(self, local_path, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def download(self, fs_path, local_path): + raise NotImplementedError + + @abc.abstractmethod + def mkdirs(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def delete(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def need_upload_download(self): + raise NotImplementedError + + @abc.abstractmethod + def rename(self, fs_src_path, fs_dst_path): + raise NotImplementedError + + @abc.abstractmethod + def mv(self, fs_src_path, fs_dst_path): + raise NotImplementedError + + @abc.abstractmethod + def upload_dir(self, local_dir, dest_dir): + raise NotImplementedError + + @abc.abstractmethod + def glob(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def stat(self, fs_path): + raise NotImplementedError + + @abc.abstractmethod + def walk(self, fs_path): + raise NotImplementedError + + +class LocalFS(FS): + def ls_dir(self, fs_path): + if not self.is_exist(fs_path): + return [], [] + + dirs = [] + files = [] + for f in os.listdir(fs_path): + if os.path.isdir(fs_path + "/" + f): + dirs.append(f) + else: + files.append(f) + + return dirs, files + + def mkdirs(self, fs_path): + assert not os.path.isfile(fs_path), "{} is already a file".format( + fs_path) + os.system("mkdir -p {}".format(fs_path)) + + def is_file(self, fs_path): + return os.path.isfile(fs_path) + + def is_dir(self, fs_path): + return os.path.isdir(fs_path) + + def is_exist(self, fs_path): + return os.path.exists(fs_path) + + def _rmr(self, fs_path): + shutil.rmtree(fs_path) + + def _rm(self, fs_path): + os.remove(fs_path) + + def delete(self, fs_path): + if not self.is_exist(fs_path): + return + + if os.path.isfile(fs_path): + return self._rm(fs_path) + + return self._rmr(fs_path) + + def rename(self, fs_src_path, fs_dst_path): + os.rename(fs_src_path, fs_dst_path) + + def need_upload_download(self): + return False + + def touch(self, fs_path): + return Path(fs_path).touch() + + def mv(self, src_path, dst_path): + if not self.is_exist(src_path): + raise FSFileNotExistsError + + if self.is_exist(dst_path): + raise FSFileExistsError + + return self.rename(src_path, dst_path) + + +"""HDFS Utils.""" + + +def _handle_errors(f): + def handler(*args, **kwargs): + start = time.time() + while True: + try: + return f(*args, **kwargs) + except ExecuteError as e: + o = args[0] + time_out = float(o._time_out) / 1000.0 + inter = float(o._sleep_inter) / 1000.0 + if time.time() - start >= time_out: + raise FSTimeOut + time.sleep(inter) + + 
return functools.wraps(f)(handler) + + +class HDFSClient(FS): + def __init__( + self, + hadoop_home, + configs, + time_out=5 * 60 * 1000, #ms + sleep_inter=1000): #ms + # Raise exception if JAVA_HOME not exists. + java_home = os.environ["JAVA_HOME"] + + self.pre_commands = [] + hadoop_bin = '%s/bin/hadoop' % hadoop_home + self.pre_commands.append(hadoop_bin) + dfs = 'fs' + self.pre_commands.append(dfs) + + if configs: + for k, v in six.iteritems(configs): + self.pre_commands.append('-D%s=%s' % (k, v)) + + self._time_out = time_out + self._sleep_inter = sleep_inter + self._base_cmd = " ".join(self.pre_commands) + self._bd_err_re = re.compile( + r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:') + + def _run_cmd(self, cmd, redirect_stderr=False): + ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr) + return int(ret), output.splitlines() + + @_handle_errors + def ls_dir(self, fs_path): + """ + list directory under fs_path, and only give the pure name, not include the fs_path + """ + if not self.is_exist(fs_path): + return [], [] + + cmd = "{} -ls {}".format(self._base_cmd, fs_path) + ret, lines = self._run_cmd(cmd) + + if ret != 0: + raise ExecuteError + + dirs = [] + files = [] + for line in lines: + arr = line.split() + if len(arr) != 8: + continue + + if fs_path not in arr[7]: + continue + + p = PurePosixPath(arr[7]) + if arr[0][0] == 'd': + dirs.append(p.name) + else: + files.append(p.name) + + return dirs, files + + def _test_match(self, lines): + for l in lines: + m = self._bd_err_re.match(l) + if m != None: + return m + + return None + + @_handle_errors + def is_dir(self, fs_path): + if not self.is_exist(fs_path): + return False + + cmd = "{} -test -d {}".format( + self._base_cmd, fs_path, redirect_stderr=True) + ret, lines = self._run_cmd(cmd) + if ret: + # other error + if self._test_match(lines) != None: + raise ExecuteError + + return False + + return True + + def is_file(self, fs_path): + if not self.is_exist(fs_path): + return False + + return not self.is_dir(fs_path) + + @_handle_errors + def is_exist(self, fs_path): + cmd = "{} -ls {} ".format(self._base_cmd, fs_path) + ret, out = self._run_cmd(cmd, redirect_stderr=True) + if ret != 0: + for l in out: + if "No such file or directory" in l: + return False + raise ExecuteError + + return True + + @_handle_errors + def upload(self, local_path, fs_path): + if self.is_exist(fs_path): + raise FSFileExistsError + + local = LocalFS() + if not local.is_exist(local_path): + raise FSFileNotExistsError + + cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path) + ret, lines = self._run_cmd(cmd) + if ret != 0: + raise ExecuteError + + @_handle_errors + def download(self, fs_path, local_path): + if self.is_exist(local_path): + raise FSFileExistsError + + if not self.is_exist(fs_path): + raise FSFileNotExistsError + + cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path) + ret, lines = self._run_cmd(cmd) + if ret != 0: + raise ExecuteError + + @_handle_errors + def mkdirs(self, fs_path): + if self.is_exist(fs_path): + return + + cmd = "{} -mkdir {}".format(self._base_cmd, fs_path) + ret, lines = self._run_cmd(cmd) + if ret != 0: + raise ExecuteError + + @_handle_errors + def mv(self, fs_src_path, fs_dst_path, test_exists=True): + if test_exists: + if not self.is_exist(fs_src_path): + raise FSFileNotExistsError + + if self.is_exist(fs_dst_path): + raise FSFileExistsError + + cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path) + ret, _ = self._run_cmd(cmd) + if ret != 0: + 
+
+    @_handle_errors
+    def _rmr(self, fs_path):
+        cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    @_handle_errors
+    def _rm(self, fs_path):
+        cmd = "{} -rm {}".format(self._base_cmd, fs_path)
+        ret, _ = self._run_cmd(cmd)
+        if ret != 0:
+            raise ExecuteError
+
+    def delete(self, fs_path):
+        if not self.is_exist(fs_path):
+            return
+
+        is_dir = self.is_dir(fs_path)
+        if is_dir:
+            return self._rmr(fs_path)
+
+        return self._rm(fs_path)
+
+    def need_upload_download(self):
+        return True
diff --git a/python/paddle/fleet/utils/http_server.py b/python/paddle/fleet/utils/http_server.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e310b0a5a516aaaebe6f35822243c56e2ba905
--- /dev/null
+++ b/python/paddle/fleet/utils/http_server.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Http Server."""
+
+import logging
+
+import six
+# NOTE: HTTPServer has a different name in python2 and python3
+if six.PY2:
+    from BaseHTTPServer import HTTPServer
+    import SimpleHTTPServer
+else:
+    from http.server import HTTPServer
+    import http.server as SimpleHTTPServer
+
+import time
+import threading
+import socket
+
+
+def get_logger(name, level, fmt):
+    logger = logging.getLogger(name)
+    logger.setLevel(level)
+    handler = logging.FileHandler('http.log', mode='w')
+    formatter = logging.Formatter(fmt=fmt)
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    return logger
+
+
+_http_server_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
+    """
+    kv handler class for kv http server,
+    it defines the way to get/set kv in server.
+    """
+
+    def do_GET(self):
+        """
+        get method for kv handler, get value according to key.
+        """
+        log_str = "GET " + self.address_string() + self.path
+        paths = self.path.split('/')
+        if len(paths) != 3:
+            print('len of request path must be 3: ' + self.path)
+            self.send_status_code(400)
+            return
+        _, scope, key = paths
+        with self.server.kv_lock:
+            value = self.server.kv.get(scope, {}).get(key)
+        if value is None:
+            log_str += ' , key not found: ' + key
+            self.send_status_code(404)
+        else:
+            log_str += ' , key found: ' + key
+            self.send_response(200)
+            self.send_header("Content-Length", str(len(value)))
+            self.end_headers()
+            self.wfile.write(value)
+        _http_server_logger.info(log_str)
+
+    def do_PUT(self):
+        """
+        put method for kv handler, set value according to key.
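+
+        For example, a client might store a value with (URL is illustrative):
+            curl -X PUT -d "some_value" http://127.0.0.1:8003/my_scope/my_key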
+ """ + log_str = "PUT " + self.address_string() + self.path + paths = self.path.split('/') + if len(paths) < 3: + print('len of request path must be 3: ' + self.path) + self.send_status_code(400) + return + _, scope, key = paths + content_length = int(self.headers['Content-Length']) + try: + value = self.rfile.read(content_length) + except: + print("receive error invalid request") + self.send_status_code(404) + return + with self.server.kv_lock: + if self.server.kv.get(scope) is None: + self.server.kv[scope] = {} + self.server.kv[scope][key] = value + self.send_status_code(200) + _http_server_logger.info(log_str) + + def do_DELETE(self): + """ + delete method for kv handler, set value according to key. + """ + log_str = "DELETE " + self.address_string() + self.path + paths = self.path.split('/') + if len(paths) < 3: + print('len of request path must be 3: ' + self.path) + self.send_status_code(400) + return + _, scope, key = paths + with self.server.delete_kv_lock: + if self.server.delete_kv.get(scope) is None: + self.server.delete_kv[scope] = [] + self.server.delete_kv[scope].append(key) + self.send_status_code(200) + _http_server_logger.info(log_str) + + def log_message(self, format, *args): + """ + ignore all logging messages in kv handler. + """ + pass + + def send_status_code(self, code): + """ + send status code back to client. + """ + self.send_response(code) + self.send_header("Content-Length", 0) + self.end_headers() + + +class KVHTTPServer(HTTPServer, object): + """ + it is a http server storing kv pairs. + """ + + def __init__(self, port, handler): + """Init.""" + super(KVHTTPServer, self).__init__(('', port), handler) + self.delete_kv_lock = threading.Lock() + self.delete_kv = {} + self.kv_lock = threading.Lock() + self.kv = {} + + def get_deleted_size(self, key): + """ + get deleted size in key. + """ + ret = 0 + with self.delete_kv_lock: + ret = self.delete_kv.get(key, 0) + return ret + + +class KVServer: + """ + it is a server storing kv pairs, has a http server inside. + """ + + def __init__(self, port, size={}): + """Init.""" + self.http_server = KVHTTPServer(port, KVHandler) + self.listen_thread = None + self.size = {} + + def start(self): + """ + start server until user calls stop to let it quit. + """ + self.listen_thread = threading.Thread( + target=lambda: self.http_server.serve_forever()) + self.listen_thread.start() + + def stop(self): + """ + stop server and clear its resources. + """ + self.http_server.shutdown() + self.listen_thread.join() + self.http_server.server_close() + + def shoud_stop(self): + """ + return whether the server should stop. 
+ + Returns: + ret(bool): whether the server should stop + """ + for key in self.size: + s = self.http_server.get_deleted_size(key) + if s != self.size.get(key, 0): + return False + return True diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 776a52b300fe0c7c582b59947e13e5ca98daf4e4..88dd815d937a4778b0d24a90d448a262689907f3 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -166,17 +166,34 @@ def __bootstrap__(): os.environ['OMP_NUM_THREADS'] = str(num_threads) sysstr = platform.system() read_env_flags = [ - 'check_nan_inf', 'fast_check_nan_inf', 'benchmark', - 'eager_delete_scope', 'fraction_of_cpu_memory_to_use', - 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'paddle_num_threads', - 'dist_threadpool_size', 'eager_delete_tensor_gb', - 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', - 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname', 'inner_op_parallelism', - 'enable_parallel_graph', 'fuse_parameter_groups_size', - 'multiple_of_cupti_buffer_size', 'fuse_parameter_memory_size', - 'tracer_profile_fname', 'dygraph_debug', 'use_system_allocator', - 'enable_unused_var_check', 'free_idle_chunk', 'free_when_no_cache_hit' + 'check_nan_inf', + 'fast_check_nan_inf', + 'benchmark', + 'eager_delete_scope', + 'fraction_of_cpu_memory_to_use', + 'initial_cpu_memory_in_mb', + 'init_allocated_mem', + 'paddle_num_threads', + 'dist_threadpool_size', + 'eager_delete_tensor_gb', + 'fast_eager_deletion_mode', + 'memory_fraction_of_eager_deletion', + 'allocator_strategy', + 'reader_queue_speed_test_mode', + 'print_sub_graph_dir', + 'pe_profile_fname', + 'inner_op_parallelism', + 'enable_parallel_graph', + 'fuse_parameter_groups_size', + 'multiple_of_cupti_buffer_size', + 'fuse_parameter_memory_size', + 'tracer_profile_fname', + 'dygraph_debug', + 'use_system_allocator', + 'enable_unused_var_check', + 'free_idle_chunk', + 'free_when_no_cache_hit', + 'call_stack_level', ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') @@ -208,12 +225,19 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ - 'fraction_of_gpu_memory_to_use', 'initial_gpu_memory_in_mb', - 'reallocate_gpu_memory_in_mb', 'cudnn_deterministic', - 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce', - 'cudnn_batchnorm_spatial_persistent', 'gpu_allocator_retry_time', - 'local_exe_sub_scope_limit', 'gpu_memory_limit_mb' + 'fraction_of_gpu_memory_to_use', + 'initial_gpu_memory_in_mb', + 'reallocate_gpu_memory_in_mb', + 'cudnn_deterministic', + 'enable_cublas_tensor_op_math', + 'conv_workspace_size_limit', + 'cudnn_exhaustive_search', + 'selected_gpus', + 'sync_nccl_allreduce', + 'cudnn_batchnorm_spatial_persistent', + 'gpu_allocator_retry_time', + 'local_exe_sub_scope_limit', + 'gpu_memory_limit_mb', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index de4330cf51669ebbbfb1ca7e9edcc0c82b1d0e72..82018132cc8b8600958e5cd52df5844e3d37638e 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -16,12 +16,13 @@ from __future__ import print_function import os import collections -from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase +from ..framework import 
Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer
 import pickle
 import six
 from . import learning_rate_scheduler
 import warnings
 from .. import core
+from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars
 
 __all__ = [
     'save_dygraph',
@@ -140,22 +141,83 @@ def load_dygraph(model_path, keep_name_table=False):
     elif model_prefix.endswith(".pdopt"):
         model_prefix = model_prefix[:-6]
 
-    params_file_path = model_prefix + ".pdparams"
-    if not os.path.exists(params_file_path):
-        raise RuntimeError("Parameter file [ {} ] not exists".format(
-            params_file_path))
-
-    with open(params_file_path, 'rb') as f:
-        para_dict = pickle.load(f) if six.PY2 else pickle.load(
-            f, encoding='latin1')
-
-    if not keep_name_table and "StructuredToParameterName@@" in para_dict:
-        del para_dict["StructuredToParameterName@@"]
+    para_dict = None
     opti_dict = None
+    params_file_path = model_prefix + ".pdparams"
     opti_file_path = model_prefix + ".pdopt"
-    if os.path.exists(opti_file_path):
-        with open(opti_file_path, 'rb') as f:
-            opti_dict = pickle.load(f) if six.PY2 else pickle.load(
-                f, encoding='latin1')
+    if not os.path.exists(params_file_path) and not os.path.exists(
+            opti_file_path):
+        # Load state dict in the format saved by `jit.save`
+        # TODO(chenweihang): [Why the `io.save_inference_model` format is not supported here]
+        # The model saved by `save_inference_model` does not completely
+        # correspond to the information required by `state_dict` in dygraph.
+        # Although a `state_dict` can be reluctantly restored in some
+        # scenarios, it may be incomplete, so supporting that format is
+        # deferred. The limitations include:
+        # 1. `save_inference_model` does not save structured names, so the
+        #    user must be reminded to set the `use_structured_name` argument
+        #    of `set_dict`, but this argument is currently not public;
+        # 2. if `save_inference_model` saves all persistable variables in a
+        #    single file, the user needs to give the variable name list to
+        #    load the `state_dict`
+
+        # 1. check model path
+        if not os.path.isdir(model_prefix):
+            raise ValueError("Model saved directory '%s' does not exist." %
+                             model_prefix)
+        # 2. load `__variables.info__`
+        var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
+        if not os.path.exists(var_info_path):
+            raise RuntimeError(
+                "No target can be loaded. Now only loading a `state_dict` "
+                "from the result saved by `imperative.save` and "
+                "`imperative.jit.save` is supported.")
+        with open(var_info_path, 'rb') as f:
+            extra_var_info = pickle.load(f)
+        # 3. load `__variables__`
+        # TODO(chenweihang): now only loading from the default save format is
+        # supported: all persistable vars are saved in one file named
+        # `__variables__`; for other cases, the arguments of this API may
+        # need to be modified
+        var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME)
+        if not os.path.exists(var_file_path):
+            raise RuntimeError(
+                "The parameter file to be loaded was not found. "
+                "Now only loading from the default save format is supported; "
+                "a custom params_filename and parameters saved in separate "
+                "files are not supported.")
+        # 4. load all persistable vars
+        load_var_list = []
+        for name in sorted(extra_var_info):
+            var = _varbase_creator(name=name, persistable=True)
+            load_var_list.append(var)
+        _dygraph_tracer().trace_op(
+            type='load_combine',
+            inputs={},
+            outputs={'Out': load_var_list},
+            attrs={'file_path': var_file_path})
+        # 5. construct state_dict
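+        # Each loaded variable is mapped back to the structured (layer-level)
+        # name recorded in `__variables.info__`, so the keys match a dygraph
+        # `state_dict`.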
+        para_dict = dict()
+        for var in load_var_list:
+            structured_name = extra_var_info[var.name].get('structured_name',
+                                                           None)
+            if structured_name is None:
+                raise RuntimeError(
+                    "Cannot find saved variable (%s)'s structured name in "
+                    "saved model." % var.name)
+            para_dict[structured_name] = var.numpy()
+        # NOTE: `jit.save` doesn't save optimizer state
+    else:
+        # Load state dict in the format saved by `save_dygraph`
+        if os.path.exists(params_file_path):
+            with open(params_file_path, 'rb') as f:
+                para_dict = pickle.load(f) if six.PY2 else pickle.load(
+                    f, encoding='latin1')
+
+            if not keep_name_table and "StructuredToParameterName@@" in para_dict:
+                del para_dict["StructuredToParameterName@@"]
+
+        if os.path.exists(opti_file_path):
+            with open(opti_file_path, 'rb') as f:
+                opti_dict = pickle.load(f) if six.PY2 else pickle.load(
+                    f, encoding='latin1')
 
     return para_dict, opti_dict
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/error.py b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
new file mode 100644
index 0000000000000000000000000000000000000000..74895f08d0f09a4aaae73b868dadc4525dc1c750
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/error.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import traceback
+
+from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginInfo, global_origin_info_map
+
+ERROR_DATA = "Error data about original source code information and traceback."
+
+
+def attach_error_data(error, in_runtime=False):
+    """
+    Attaches error data about the original source code and traceback to an error.
+
+    Args:
+        error(Exception): a native error.
+        in_runtime(bool): True if `error` was raised in runtime, False if it
+            was raised at compile time.
+    Returns:
+        The error with attached data about the original source code
+        information and traceback.
+    """
+    e_type, e_value, e_traceback = sys.exc_info()
+    tb = traceback.extract_tb(e_traceback)[1:]
+
+    error_data = ErrorData(e_type, e_value, tb, global_origin_info_map)
+    error_data.in_runtime = in_runtime
+
+    setattr(error, ERROR_DATA, error_data)
+
+    return error
+
+
+class TraceBackFrame(OriginInfo):
+    """
+    Traceback frame information.
+    """
+
+    def __init__(self, location, function_name, source_code):
+        self.location = location
+        self.function_name = function_name
+        self.source_code = source_code
+
+
+class ErrorData(object):
+    """
+    Error data attached to an exception raised in un-transformed (user) code.
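+    It holds the exception type and value, the original traceback frames, and
+    the map from transformed-code locations back to the user's dygraph code.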
+ """ + + def __init__(self, error_type, error_value, origin_traceback, + origin_info_map): + self.error_type = error_type + self.error_value = error_value + self.origin_traceback = origin_traceback + self.origin_info_map = origin_info_map + self.in_runtime = False + + def create_exception(self): + message = self.create_message() + new_exception = self.error_type(message) + setattr(new_exception, ERROR_DATA, self) + return new_exception + + def create_message(self): + """ + Creates a custom error message which includes trace stack with source code information of dygraph from user. + """ + message_lines = [] + + # Step1: Adds header message to prompt users that the following is the original information. + header_message = "In user code:" + message_lines.append(header_message) + message_lines.append("") + + # Simplify error value to improve readability if error is raised in runtime + if self.in_runtime: + self._simplify_error_value() + message_lines.append(str(self.error_value)) + return '\n'.join(message_lines) + + # Step2: Optimizes stack information with source code information of dygraph from user. + for filepath, lineno, funcname, code in self.origin_traceback: + loc = Location(filepath, lineno) + + dygraph_func_info = self.origin_info_map.get(loc.line_location, + None) + if dygraph_func_info: + # TODO(liym27): more information to prompt users that this is the original information. + # Replaces trace stack information about transformed static code with original dygraph code. + traceback_frame = self.origin_info_map[loc.line_location] + else: + traceback_frame = TraceBackFrame(loc, funcname, code) + + message_lines.append(traceback_frame.formated_message()) + + # Step3: Adds error message like "TypeError: dtype must be int32, but received float32". + error_message = " " * 4 + traceback.format_exception_only( + self.error_type, self.error_value)[0].strip("\n") + message_lines.append(error_message) + + return '\n'.join(message_lines) + + def _simplify_error_value(self): + """ + Simplifies error value to improve readability if error is raised in runtime. + + NOTE(liym27): The op callstack information about transformed static code has been replaced with original dygraph code. + + TODO(liym27): + 1. Need a more robust way because the code of start_trace may change. + 2. 
+        """
+        assert self.in_runtime is True
+
+        error_value_lines = str(self.error_value).split("\n")
+        error_value_lines_strip = [mes.lstrip(" ") for mes in error_value_lines]
+
+        start_trace = "outputs = static_func(*inputs)"
+        start_idx = error_value_lines_strip.index(start_trace)
+        error_value_lines = error_value_lines[start_idx + 1:]
+
+        error_value_str = '\n'.join(error_value_lines)
+        self.error_value = self.error_type(error_value_str)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
index 6b9ee9cbbe21b405b93a7ba2e86b39b47225196c..c66778992c25c68a5adf6a17ae0cd46b57b02fd6 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py
@@ -39,32 +39,21 @@ GENERATE_VARIABLE_PREFIX = 'generate_variable'
 
 
 def create_while_node(condition_name, body_name, loop_var_names):
-    while_args = []
-    while_args.append(
-        gast.Name(
-            id=condition_name,
-            ctx=gast.Param(),
-            annotation=None,
-            type_comment=None))
-    while_args.append(
-        gast.Name(
-            id=body_name, ctx=gast.Param(), annotation=None, type_comment=None))
-    assign_targets = [
-        gast.Name(
-            id=var_name, ctx=gast.Param(), annotation=None, type_comment=None)
-        for var_name in loop_var_names
-    ]
-    while_args.append(gast.List(elts=assign_targets, ctx=gast.Param()))
-
-    while_func_id = gast.parse(
-        'fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop'
-    ).body[0].value
-    while_node = gast.Call(func=while_func_id, args=while_args, keywords=[])
-    assign_node = gast.Assign(
-        targets=[gast.Tuple(
-            elts=assign_targets, ctx=gast.Store())],
-        value=while_node)
-    return assign_node
+    # NOTE(liym27):
+    # It's better to parse the source code into an AST node than to build a
+    # custom AST node with child nodes, because it is easy to get an AST node
+    # type wrong when constructing nodes by hand.
+    #
+    # For example, with loop_var_names = [a, b, foo.x], the type of `a` or `b`
+    # is gast.Name, but the type of `foo.x` is gast.Attribute.
+
+    while_func_name = "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop"
+    while_node_str = "[{}] = {}({}, {}, [{}])".format(
+        ",".join(loop_var_names), while_func_name, condition_name, body_name,
+        ",".join(loop_var_names))
+
+    while_node = gast.parse(while_node_str).body[0]
+
+    return while_node
 
 
 class NameVisitor(gast.NodeVisitor):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
index 429fa27f618765c78ad8b7e171b5b6341ed7335d..aeece9513b57710b767322c2a7986eec087b4f8d 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py
@@ -19,8 +19,12 @@ import inspect
 
 import gast
 
+from paddle.fluid import core
+from paddle.fluid.framework import Program
+
 # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
 ORIGI_INFO = "Original information of source code for ast node."
+ORIGI_INFO_MAP = "Original information map of source code."
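+# (The origin-info maps built from these are keyed by `Location.line_location`,
+# i.e. a (filepath, lineno) pair of the transformed code.)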
class Location(object): @@ -64,6 +68,15 @@ class OriginInfo(object): return "{} \nsource_code: {} in function {}\n ".format( self.location, self.source_code, self.function_name) + def formated_message(self): + return ' File "{}", line {}, in {}\n\t{}'.format( + self.location.filepath, self.location.lineno, self.function_name, + self.source_code.lstrip()) + + def as_frame(self): + return (self.location.filepath, self.location.lineno, + self.function_name, self.source_code.lstrip()) + class OriginInfoAttacher(gast.NodeTransformer): """ @@ -119,7 +132,12 @@ class OriginInfoAttacher(gast.NodeTransformer): return self.col_offset + node.col_offset -def create_origin_info_map(transformed_node, static_func): +global_origin_info_map = {} + + +def create_and_update_origin_info_map(transformed_node, + static_func, + is_global=True): """ Creates a original information map between transformed static function and original dygraph function. @@ -156,6 +174,10 @@ def create_origin_info_map(transformed_node, static_func): origin_info_map[static_loc] = dygraph_info + global_origin_info_map.update(origin_info_map) + if is_global: + return global_origin_info_map + return origin_info_map @@ -234,3 +256,63 @@ def ast_walk(transformed_node, static_node): if isinstance(d_item, gast.AST): transformed_node_list.append(d_item) static_node_list.append(s_item) + + +def update_op_callstack_with_origin_info(program): + """ + Replaces op callstack information about transformed static code with original dygraph code. + """ + + assert isinstance(program, Program) + + def get_new_op_callstack(callstack): + """ + An example of callstack: + + File "path1/to/file.py", line 10, in func_1 + y = fluid.layers.fill_constant(x, shape=[1], dtype="int32") + File "path2/to/file.py", line 740, in fill_constant + stop_gradient=True) + File "path3/to/file.py", line 43, in append_op + return self.main_program.current_block().append_op(*args, **kwargs) + File "path4/to/file.py", line 2811, in append_op + attrs=kwargs.get("attrs", None)) + File "path5/to/file.py", line 1919, in __init__ + for frame in traceback.extract_stack(): + """ + + assert len(callstack) % 2 == 0 + for i in range(0, len(callstack), 2): + + file_line = callstack[i].lstrip(" ").split(",") + + filepath = file_line[0][6:-1] + lineno = int(file_line[1][6:]) + funcname = file_line[2][4:] + code = callstack[i + 1].lstrip(" ") + + loc = Location(filepath, lineno) + dygraph_func_info = global_origin_info_map.get(loc.line_location) + if dygraph_func_info: + filepath, lineno, funcname, code = \ + dygraph_func_info.as_frame() + + callstack[i] = ' File "{}", line {}, in {}'.format( + filepath, lineno, funcname) + callstack[i + 1] = ' {}'.format(code) + + return callstack + + op_maker = core.op_proto_and_checker_maker + callstack_var_name = op_maker.kOpCreationCallstackAttrName() + + for block in program.blocks: + for i, op in enumerate(block.ops): + if op.has_attr(callstack_var_name): + callstack = op.attr(callstack_var_name) + + callstack = get_new_op_callstack(callstack) + + op._set_attr(callstack_var_name, callstack) + + return program diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 05fce7bf837664eafd89319eb6cdd973b745605f..7d2a767dd8f86fbf7e0908720d4d8a81a4885685 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -130,8 +130,6 @@ class PartialProgramLayer(layers.Layer): 
self._check_params_all_inited(main_program) # 2. Prune the parameters not used anywhere in the program. self._prune_unused_params(main_program) - # 3. Remove op's python call stack with redundant low-level error messages. - main_program = self._remove_op_call_stack(main_program) return main_program diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index 1b6b64ae1fdee89b8e7d9bfcb6601d27f76d10a5..d555c8ed28f358a43e53966dd30d76d85a03dde5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -47,8 +47,7 @@ class PrintTransformer(gast.NodeTransformer): # NOTE: deal with print in PY3 def visit_Call(self, node): if isinstance(node.func, gast.Name) and node.func.id == 'print': - convert_print_node = self._create_print_node(node.args) - return gast.Expr(value=convert_print_node) + node = self._create_print_node(node.args) return node # NOTE: deal with print in PY2 diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 79e812ff6192bc09e6e8c71397c6a239011dfae6..88562dd40a63b3da50b34bd1cb5c1094aef1ae42 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -36,6 +36,9 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle.fluid.dygraph.base import param_guard from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from +from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info, create_and_update_origin_info_map +from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info +from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data, ERROR_DATA __all__ = ['ProgramTranslator', 'convert_to_static'] @@ -88,15 +91,23 @@ class FunctionCache(object): # with decorator directly and function.__wrapped__ holds the actual function. func = getattr(func, '__wrapped__', func) source_code = func_to_source_code(func) + + # TODO(liym27): + # Consider this case: source_code in self._code_to_ast_caches, + # but actually they are methods in different classes. + # Maybe use (__class__, source_code) as key if source_code in self._code_to_ast_caches: root_wrapper = self._code_to_ast_caches[source_code] else: root = gast.parse(source_code) + root = attach_origin_info(root, func) root_wrapper = self._dygraph_to_static.get_static_ast(root) self._code_to_ast_caches[source_code] = root_wrapper # Get static function from AST static_func, file_name = ast_to_func(root_wrapper.node, func) + + create_and_update_origin_info_map(root_wrapper.node, static_func) return static_func def exist(self, func): @@ -125,6 +136,7 @@ class FunctionSpec(object): self._args = args self._kwargs = kwargs + # TODO(liym27): func has multi layer decorator dyfunc = getattr(func, '__wrapped__', func) self._dyfunc_code = inspect.getsource(dyfunc) @@ -282,11 +294,19 @@ class ConcreteProgram(object): # 3. Builds program only once and returns the output Variables. 
with param_guard(func_spec.parameters(False)), param_guard( func_spec.buffers(False)): - outputs = static_func(*inputs) + try: + outputs = static_func(*inputs) + except BaseException as e: + # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. + attach_error_data(e) + raise + if not isinstance(outputs, (tuple, list)) and outputs is not None: outputs = [outputs] + main_program = update_op_callstack_with_origin_info(main_program) + return ConcreteProgram( inputs=inputs, outputs=outputs, @@ -483,14 +503,24 @@ class ProgramTranslator(object): return dygraph_func(*args, **kwargs) function_spec = FunctionSpec(dygraph_func, args, kwargs) - _, partial_program_layer = self._program_cache[function_spec] + concrete_program, partial_program_layer = self._program_cache[ + function_spec] if args and isinstance(args[0], layers.Layer): # Synchronize self.training attribute. partial_program_layer.training = args[0].training args = args[1:] - - return partial_program_layer(args) + try: + return partial_program_layer(args) + + except BaseException as e: + # NOTE: + # 1. If e is raised in compile time, e should have been attached to ERROR_DATA before; + # 2. If e raised in runtime, e should be attached to ERROR_DATA here. + if not hasattr(e, ERROR_DATA): + # runtime error + attach_error_data(e, in_runtime=True) + raise def get_func(self, dygraph_func): """ diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 38e4e517836ed8ddbeb36fb68a0c34fa9826f233..7396289392affa92e69e9f55fba622fd13fa979f 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -425,8 +425,7 @@ def _load_persistable_vars(model_path, params_filename=None): # 1. load extra var info with open(var_info_path, 'rb') as f: - extra_var_info = pickle.load(f) if six.PY2 else pickle.load( - f, encoding='latin1') + extra_var_info = pickle.load(f) # 2. 
construct var dict load_var_dict = dict() diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 64faae247fbf80637a45429eaa1d5833df122a1a..8439b87dd9ced618ad4f0b2e6d9d321d5f8662be 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -15,20 +15,23 @@ from __future__ import print_function import os -import six import pickle - import warnings + +import six from paddle.fluid import core from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph -from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator, FunctionSpec +from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA +from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator +from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer from paddle.fluid.dygraph.layers import Layer from paddle.fluid.executor import Executor, scope_guard -from paddle.fluid.framework import Program, Block, Variable, ParamBase, _dygraph_tracer, dygraph_only, _dygraph_guard, _current_expected_place, in_dygraph_mode +from paddle.fluid.framework import Block, ParamBase, Program, Variable +from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dygraph_tracer +from paddle.fluid.framework import dygraph_only, in_dygraph_mode from paddle.fluid.wrapped_decorator import wrap_decorator -from paddle.fluid.dygraph.io import TranslatedLayer, VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME __all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func'] @@ -167,7 +170,25 @@ def _declarative_(dygraph_func): "The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. " "We will just return dygraph output.") return dygraph_func(*args, **kwargs) - return program_translator.get_output(dygraph_func, *args, **kwargs) + try: + return program_translator.get_output(dygraph_func, *args, **kwargs) + except Exception as e: + error_data = getattr(e, ERROR_DATA, None) + if error_data: + new_exception = error_data.create_exception() + if six.PY3: + # NOTE(liym27): + # 1. Why `raise new_exception from None`? + # In Python 3, by default, an new exception is raised with trace information of the caught exception. + # This only raises new_exception and hides unwanted implementation details from tracebacks of the + # caught exception. + # 2. Use exec to bypass syntax error checking in Python 2. + + six.exec_("raise new_exception from None") + else: + raise new_exception + else: + raise return __impl__ diff --git a/python/paddle/fluid/incubate/fleet/base/fleet_base.py b/python/paddle/fluid/incubate/fleet/base/fleet_base.py index 9be1fe92d1d0c77dd809a90ab00585ab5997f0a2..f236a3e98c61bade5804e7a91978352174a9c5b2 100644 --- a/python/paddle/fluid/incubate/fleet/base/fleet_base.py +++ b/python/paddle/fluid/incubate/fleet/base/fleet_base.py @@ -21,7 +21,7 @@ from paddle.fluid.executor import Executor from paddle.fluid.optimizer import SGD from paddle.fluid.incubate.fleet.base.mode import Mode -from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase +from paddle.fleet.base.role_maker import RoleMakerBase from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision from . 
import mode @@ -209,7 +209,10 @@ class Fleet(object): self._executor = Executor(fluid.CPUPlace()) if role_maker and not isinstance(role_maker, RoleMakerBase): - raise TypeError("role_maker must be an instance of RoleMakerBase") + from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase as RoleMakerBaseIncubate + if role_maker and not isinstance(role_maker, RoleMakerBaseIncubate): + raise TypeError( + "role_maker must be an instance of RoleMakerBase") self._role_maker = role_maker self._role_maker.generate_role() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py index a7d86411e203728116604ffafddf36a1cfaed9b3..d2c7397c85f8df155444d9272c7b75596f0fe169 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py @@ -579,7 +579,7 @@ class FleetTranspiler(Fleet): block.append_op( type='recv_save', attrs={ - "trainer_id": self._role_maker.worker_id(), + "trainer_id": self._role_maker.worker_index(), "shape": var.shape, "slice_shapes": [",".join([str(i) for i in var.shape])], diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index 2056e3deb18476748df0e16bc18b59f0a1074d55..b96eff19e9b9c5d8e78b85e61b9a69afee106546 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -329,7 +329,7 @@ class CompileTimeStrategy(object): is_distributed = True if param_name in distibuted_varnames else False - ctx = self.build_ctx(grad, self.grad_var_mapping, True, False, + ctx = self.build_ctx(grad, self.grad_var_mapping, True, True, True, is_distributed) send_ctx[ctx.var_name()] = ctx diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1b8df4a098ff1c11270854976d82693932627477..1ef15ca4733518949a3d1856c20a0e5f70820554 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6200,7 +6200,7 @@ def squeeze(input, axes, name=None): Out.shape = [1,3,5] Args: - input (Variable): The input Tensor. Support data type: float16, float32, float64, int8, int32, int64. + input (Variable): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64. axes (list): One integer or List of integers, indicating the dimensions to be squeezed. Axes range is :math:`[-rank(input), rank(input))`. If axes is negative, :math:`axes=axes+rank(input)`. @@ -6226,8 +6226,9 @@ def squeeze(input, axes, name=None): helper = LayerHelper("squeeze", **locals()) check_variable_and_dtype( input, 'input', - ['float16', 'float32', 'float64', 'int8', 'int32', 'int64'], 'squeeze') - check_type(axes, 'axes', (list, tuple), 'squeeze') + ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'], + 'squeeze') + check_type(axes, 'axis/axes', (list, tuple), 'squeeze') out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) helper.append_op( @@ -6254,12 +6255,12 @@ def unsqueeze(input, axes, name=None): then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1]. Args: - input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32. 
+ input (Variable): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64. axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor . name (str|None): Name for this layer. Returns: - Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64. + Variable: Unsqueezed Tensor, with the same data type as input. Examples: .. code-block:: python @@ -6269,10 +6270,15 @@ def unsqueeze(input, axes, name=None): y = fluid.layers.unsqueeze(input=x, axes=[1]) """ - if not isinstance(axes, (int, list, tuple, Variable)): - raise TypeError( - "The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but " - "received %s." % (type(axes))) + if in_dygraph_mode(): + out, _ = core.ops.unsqueeze2(input, 'axes', axes) + return out + + check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') + check_variable_and_dtype( + input, 'input', + ['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'], + 'unsqueeze') helper = LayerHelper("unsqueeze2", **locals()) inputs = {"X": input} attrs = {} @@ -9966,7 +9972,7 @@ def stack(x, axis=0, name=None): must be the same. Supposing input is N dims Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. - Support data types: float32, float64, int32, int64. + Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`. R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`. The default value of axis is 0. 
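# Example (illustrative, not part of the patch): with the dygraph fast path
# added above, `unsqueeze` dispatches directly to `core.ops.unsqueeze2`:
#
#     import numpy as np
#     import paddle.fluid as fluid
#
#     with fluid.dygraph.guard():
#         x = fluid.dygraph.to_variable(
#             np.random.rand(3, 4).astype('float32'))
#         y = fluid.layers.unsqueeze(x, axes=[1])  # y.shape == [3, 1, 4]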
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index e33b34cc9254b18a18c293fb3670203fecdeb38f..2d874b4806c9e1449a170017440c4b5038ff93bf 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -685,8 +685,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): """ attrs = {'force_cpu': force_cpu} + dtype = convert_dtype(dtype) if not isinstance(value, Variable): - if convert_dtype(dtype) in ['int64', 'int32']: + if dtype in ['int64', 'int32']: attrs['str_value'] = str(int(value)) else: attrs['str_value'] = str(float(value)) @@ -697,7 +698,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): out = _varbase_creator(dtype=dtype) if isinstance(value, Variable): - if convert_dtype(dtype) in ['int64', 'int32']: + if dtype in ['int64', 'int32']: attrs['str_value'] = str(int(value.numpy())) else: attrs['str_value'] = str(float(value.numpy())) @@ -712,6 +713,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): helper = LayerHelper("fill_constant", **locals()) inputs = {} if isinstance(value, Variable): + if convert_dtype(value.dtype) != dtype: + value = cast(value, dtype) inputs['ValueTensor'] = value check_dtype(dtype, 'dtype', diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d73b9511b76ed6585c662264e99fe41f3354bc29..686844fea76c01e3ebaa3430eb67fc35bc46fd86 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -345,7 +345,6 @@ if(WITH_DISTRIBUTE) # FIXME(typhoonzero): add these tests back list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler") - list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr") #not need list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base") diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 56ca3105dea792ed964d62057c07e2da7a4140fa..033bc3850052199ca8da6d4588851de9c9903193 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -28,6 +28,7 @@ import numpy as np import ctr_dataset_reader from test_dist_fleet_base import runtime_main, FleetDistRunnerBase +from paddle.fleet.base.util_factory import fleet_util # Fix seed for test fluid.default_startup_program().random_seed = 1 @@ -181,8 +182,14 @@ class TestDistCTR2x2(FleetDistRunnerBase): loss_val = exe.run(program=compiled_prog, fetch_list=[self.avg_cost.name]) loss_val = np.mean(loss_val) - print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id, - loss_val)) + reduce_output = fleet_util.all_reduce( + np.array(loss_val), mode="sum") + loss_all_trainer = fleet_util.all_gather(float(loss_val)) + loss_val = float(reduce_output) / len(loss_all_trainer) + message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id, + loss_val) + fleet_util.print_on_rank(message, 0) + pass_time = time.time() - pass_start except fluid.core.EOFException: self.reader.reset() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py new file mode 100644 index 0000000000000000000000000000000000000000..586020d434519b12c6fff4cbba812a013cf45c3d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -0,0 +1,147 @@ +# Copyright (c) 
2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import inspect +import unittest + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.core import EnforceNotMet +from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData +from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap +from paddle.fluid.dygraph.jit import declarative + + +def inner_func(): + fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int") + return + + +@declarative +def func_error_in_compile_time(x): + x = fluid.dygraph.to_variable(x) + inner_func() + if fluid.layers.mean(x) < 0: + x_v = x - 1 + else: + x_v = x + 1 + return x_v + + +@declarative +def func_error_in_compile_time_2(x): + x = fluid.dygraph.to_variable(x) + x = fluid.layers.reshape(x, shape=[1, 2]) + return x + + +@declarative +def func_error_in_runtime(x, iter_num=3): + x = fluid.dygraph.to_variable(x) + two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") + x = fluid.layers.reshape(x, shape=[1, two]) + return x + + +class TestErrorInCompileTime(unittest.TestCase): + def setUp(self): + self.set_func() + self.set_input() + self.set_exception_type() + + def set_func(self): + self.func = func_error_in_compile_time + + def set_exception_type(self): + self.exception_type = TypeError + + def set_input(self): + self.input = np.ones([3, 2]) + + def set_message(self): + self.expected_message = \ + ['File "{}", line 36, in func_error_in_compile_time'.format(self.filepath), + 'inner_func()', + 'File "{}", line 29, in inner_func'.format(self.filepath), + 'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")', + ] + + def _test_create_message(self, error_data): + self.filepath = inspect.getfile(unwrap(self.func)) + self.set_message() + error_message = error_data.create_message() + + self.assertIn('In user code:', error_message) + for m in self.expected_message: + self.assertIn(m, error_message) + + def test(self): + with fluid.dygraph.guard(): + with self.assertRaises(self.exception_type) as cm: + self.func(self.input) + exception = cm.exception + error_data = getattr(exception, ERROR_DATA) + self.assertIsInstance(error_data, ErrorData) + self._test_create_message(error_data) + + +class TestErrorInCompileTime2(TestErrorInCompileTime): + def set_func(self): + self.func = func_error_in_compile_time_2 + + def set_exception_type(self): + self.exception_type = EnforceNotMet + + def set_message(self): + + self.expected_message = \ + [ + 'File "{}", line 47, in func_error_in_compile_time_2'.format(self.filepath), + 'x = fluid.layers.reshape(x, shape=[1, 2])' + ] + + +class TestErrorInRuntime(TestErrorInCompileTime): + def set_func(self): + self.func = func_error_in_runtime + + def set_exception_type(self): + self.exception_type = EnforceNotMet + + def set_message(self): + self.expected_message = \ + [ + 'File "{}", line 55, in func_error_in_runtime'.format(self.filepath), + 'x = fluid.layers.reshape(x, shape=[1, 
two])' + ] + + def _test_create_message(self, error_data): + self.filepath = inspect.getfile(unwrap(self.func)) + self.set_message() + + with self.assertRaises(ValueError): + error_data.create_message() + + error_data.in_runtime = False + error_message = error_data.create_message() + + self.assertIn('In user code:', error_message) + for m in self.expected_message: + self.assertIn(m, error_message) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index 631655ec74428344376ea5b814ea443a91c49fc0..b03777b6ebc7f3cceb73cd32e6fdfea11755320e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -90,7 +90,8 @@ class TestOriginInfo(unittest.TestCase): # step3 self.static_func, _ = ast_to_func(transformed_ast, self.dygraph_func) - info_map = create_origin_info_map(dygraph_ast, self.static_func) + info_map = create_and_update_origin_info_map(dygraph_ast, + self.static_func) return info_map diff --git a/python/paddle/fluid/tests/unittests/multi_process.py b/python/paddle/fluid/tests/unittests/multi_process.py index a67634adfcc0c27d5c9b470c81b880af9130462f..f999ce803a512403da14a4ea2064448aedfe242e 100644 --- a/python/paddle/fluid/tests/unittests/multi_process.py +++ b/python/paddle/fluid/tests/unittests/multi_process.py @@ -17,7 +17,7 @@ import sys import time -def train(): +def train(prefix): selected_gpus = os.getenv("FLAGS_selected_gpus") trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") @@ -29,11 +29,12 @@ def train(): .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id) print(name) - with open("multi_process.check_{}.log".format(trainer_id), "w") as f: + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: f.write(name) -def train_abort(): +def train_abort(prefix): selected_gpus = os.getenv("FLAGS_selected_gpus") trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") @@ -49,8 +50,9 @@ def train_abort(): name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id) print(name) - with open("multi_process.check_{}.log".format(trainer_id), - "w") as f: + with open( + "multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: f.write(name) raise else: @@ -60,12 +62,15 @@ def train_abort(): .format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id) print(name) - with open("multi_process.check_{}.log".format(trainer_id), "w") as f: + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: f.write(name) if __name__ == '__main__': - if len(sys.argv) == 2 and sys.argv[1] == "abort": - train_abort() + if len(sys.argv) == 3 and sys.argv[2] == "abort": + prefix = sys.argv[1] + train_abort(prefix) else: - train() + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 8c0b599a37936ffe47ac44ae54fe6e25768e4a4f..0bcdc45a2ccd0fd240c42b68a657557e50e4dc02 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -63,18 +63,104 @@ class TestAddMMOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): # The input type of addmm_op must be Variable. + input = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace()) x1 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace()) x2 = fluid.create_lod_tensor( - np.array([[-1]]), [[1]], fluid.CPUPlace()) + np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace()) self.assertRaises(TypeError, paddle.addmm, input, x1, x2) + # The input dtype of mul_op must be float32 or float64. - input = fluid.layers.data(name='input', shape=[4], dtype="int32") - x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32") - x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32") + input = fluid.layers.data( + name='input', + shape=[4, 4], + dtype="int32", + append_batch_size=False) + x3 = fluid.layers.data( + name='x3', shape=[4, 4], dtype="int32", append_batch_size=False) + x4 = fluid.layers.data( + name='x4', shape=[4, 4], dtype="int32", append_batch_size=False) self.assertRaises(TypeError, paddle.addmm, input, x3, x4) + # x and y dimension mismatch + x5 = fluid.layers.data( + name='x5', + shape=[4, 5], + dtype="float32", + append_batch_size=False) + x6 = fluid.layers.data( + name='x6', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + self.assertRaises(ValueError, paddle.addmm, input, x5, x6) + # input and x are not broadcastable + x7 = fluid.layers.data( + name='x7', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + x8 = fluid.layers.data( + name='x8', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + input1 = fluid.layers.data( + name='input1', + shape=[2, 4], + dtype="float32", + append_batch_size=False) + self.assertRaises(ValueError, paddle.addmm, input1, x7, x8) + # input and x are not broadcastable + x9 = fluid.layers.data( + name='x9', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + x10 = fluid.layers.data( + name='x10', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + input2 = fluid.layers.data( + name='input2', + shape=[1, 2], + dtype="float32", + append_batch_size=False) + self.assertRaises(ValueError, paddle.addmm, input2, x9, x10) + x11 = fluid.layers.data( + name='x11', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + x12 = fluid.layers.data( + name='x12', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + input3 = fluid.layers.data( + name='input3', + shape=[4, 2], + dtype="float32", + append_batch_size=False) + self.assertRaises(ValueError, paddle.addmm, input3, x11, x12) + x13 = fluid.layers.data( + name='x13', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + x14 = fluid.layers.data( + name='x14', + shape=[4, 4], + dtype="float32", + append_batch_size=False) + input4 = fluid.layers.data( + name='input4', + shape=[3, 1], + dtype="float32", + append_batch_size=False) + self.assertRaises(ValueError, paddle.addmm, input4, x13, x14) class TestAddMMOp2(TestAddMMOp): @@ -147,5 +233,23 @@ class TestAddMMOp4(unittest.TestCase): assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy()) +''' +class TestAddMMAPI(unittest.TestCase): + def test_api_error(self): + data_x = np.ones((2, 2)).astype(np.float32) + data_y = np.ones((2, 2)).astype(np.float32) + data_input = np.ones((2, 2)).astype(np.float32) + + 
paddle.enable_imperative() + + def test_error1(): + data_x_wrong = np.ones((2, 3)).astype(np.float32) + x = paddle.imperative.to_variable(data_x_wrong) + y = paddle.imperative.to_variable(data_y) + input = paddle.imperative.to_variable(data_input) + out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 ) + self.assertRaises(ValueError, test_error1) +''' + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_bmm_op.py b/python/paddle/fluid/tests/unittests/test_bmm_op.py index 993ac25d8d4b638a56c9e2aa4f832f576f0b2ae7..cb1b3ded53472c022ef83539f573c9e6c192a966 100644 --- a/python/paddle/fluid/tests/unittests/test_bmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_bmm_op.py @@ -73,5 +73,15 @@ class API_TestDygraphBmm(unittest.TestCase): self.assertTrue(np.allclose(expected_result, out_np)) +class TestBmmAPIError(unittest.TestCase): + def test_api_error(self): + x_data = np.arange(24, dtype='float32').reshape((2, 3, 4)) + y_data = np.arange(16, dtype='float32').reshape((2, 4, 2)) + y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4)) + y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2)) + self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1) + self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index 16f0fc0a35e6140941da09c13bf67855670fc6a1..8b2f7118ea766a0a2e5a7f74daa243b99f64129d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -21,6 +21,9 @@ import os import sys import subprocess +import six +import shutil +import numpy as np import argparse from contextlib import closing import socket @@ -29,7 +32,8 @@ import tempfile import unittest import paddle.fluid as fluid -import paddle.fluid.incubate.fleet.base.role_maker as role_maker +import paddle.fleet.base.role_maker as role_maker +from paddle.fleet.base.util_factory import fleet_util from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory @@ -48,18 +52,26 @@ class FleetDistRunnerBase(object): """ def build_role(self, args): + if args.role.upper() == "PSERVER": role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=True, + path=args.gloo_path, current_id=args.current_id, role=role_maker.Role.SERVER, - worker_num=args.trainers, + worker_endpoints=args.trainer_endpoints.split(","), server_endpoints=args.endpoints.split(",")) else: role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=True, + path=args.gloo_path, current_id=args.current_id, role=role_maker.Role.WORKER, - worker_num=args.trainers, + worker_endpoints=args.trainer_endpoints.split(","), server_endpoints=args.endpoints.split(",")) + self.role = role return role def build_strategy(self, args): @@ -114,26 +126,13 @@ class FleetDistRunnerBase(object): optimizer.minimize(avg_cost) def run_pserver(self, args): - fleet.init(self.build_role(args)) - strategy = self.build_strategy(args) - avg_cost = self.net(args) - self.build_optimizer(avg_cost, strategy) - fleet.init_server() fleet.run_server() def run_dataset_trainer(self, args): - fleet.init(self.build_role(args)) - strategy = self.build_strategy(args) - avg_cost = 
self.net(args) - self.build_optimizer(avg_cost, strategy) out = self.do_dataset_training(fleet) def run_pyreader_trainer(self, args): - fleet.init(self.build_role(args)) - strategy = self.build_strategy(args) - avg_cost = self.net(args) - self.build_optimizer(avg_cost, strategy) out = self.do_pyreader_training(fleet) def net(self, args, batch_size=4, lr=0.01): @@ -173,10 +172,14 @@ class TestFleetBase(unittest.TestCase): print("set begin_port:", DIST_UT_PORT) self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( DIST_UT_PORT, DIST_UT_PORT + 1) - DIST_UT_PORT += 2 + self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + DIST_UT_PORT + 2, DIST_UT_PORT + 3) + DIST_UT_PORT += 4 else: self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._find_free_port(), self._find_free_port()) + self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) self._python_interp = sys.executable self._geo_sgd_need_push_nums = 5 @@ -236,18 +239,22 @@ class TestFleetBase(unittest.TestCase): def _run_cluster(self, model, envs): env = {'GRAD_CLIP': str(self._grad_clip_mode)} python_path = self._python_interp + gloo_path = tempfile.mkdtemp() + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '') python_path += " -m coverage run --branch -p" env.update(envs) - tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format( - python_path, model, self._ps_endpoints, self._trainers, self._mode, - self._geo_sgd_need_push_nums, self._reader) + tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format( + python_path, model, self._ps_endpoints, self._tr_endpoints, + self._trainers, self._mode, self._geo_sgd_need_push_nums, + self._reader, gloo_path) - ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format( - python_path, model, self._ps_endpoints, self._trainers, self._mode, - self._geo_sgd_need_push_nums, self._reader) + ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format( + python_path, model, self._ps_endpoints, self._tr_endpoints, + self._trainers, self._mode, self._geo_sgd_need_push_nums, + self._reader, gloo_path) # Run dist train to compare with local results ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env) @@ -284,6 +291,7 @@ class TestFleetBase(unittest.TestCase): ps0.terminate() ps1.terminate() + shutil.rmtree(gloo_path) return 0, 0 def check_with_place(self, @@ -313,6 +321,9 @@ def runtime_main(test_class): parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) parser.add_argument('--endpoints', type=str, required=False, default="") + parser.add_argument( + '--trainer_endpoints', type=str, required=False, default="") + parser.add_argument('--gloo_path', type=str, required=False, default="") parser.add_argument('--current_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument('--mode', type=str, required=False, default='geo') @@ -322,6 +333,13 @@ def runtime_main(test_class): args = parser.parse_args() model = test_class() + role = model.build_role(args) + fleet.init(role) + strategy 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
index 5fc37335b21536cef160c9f72e68bf7eb0610e97..18629c4f996a6d068339bd6cad494e8e8d21123f 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -22,7 +22,7 @@ from test_dist_fleet_base import TestFleetBase
 
 class TestDistMnistSync2x2(TestFleetBase):
     def _setup_config(self):
-        self._mode = "sync"
+        self._mode = "async"
         self._reader = "pyreader"
 
     def check_with_place(self,
diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
index 3eb761f925a677dcbaa3d7e39221299013f84b33..aefc809bd5cb852d3fde95dff4550e506c5f1c12 100644
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
@@ -269,18 +269,26 @@ class TestFillConstantAPI(unittest.TestCase):
         out_6 = fluid.layers.fill_constant(
             shape=shape_tensor_int64, dtype=np.float32, value=1.1)
 
-        val = fluid.layers.fill_constant(shape=[1], dtype=np.float32, value=1.1)
+        val1 = fluid.layers.fill_constant(
+            shape=[1], dtype=np.float32, value=1.1)
+        val2 = fluid.layers.fill_constant(
+            shape=[1], dtype=np.float64, value=1.1)
         out_7 = fluid.layers.fill_constant(
-            shape=shape_tensor_int64, dtype=np.float32, value=val)
+            shape=shape_tensor_int64, dtype=np.float32, value=val1)
+
+        out_8 = fluid.layers.fill_constant(
+            shape=shape_tensor_int64, dtype=np.float32, value=val2)
 
         exe = fluid.Executor(place=fluid.CPUPlace())
-        res_1, res_2, res_3, res_4, res_5, res_6, res_7 = exe.run(
+        res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run(
             fluid.default_main_program(),
             feed={
                 "shape_tensor_int32": np.array([1, 2]).astype("int32"),
                 "shape_tensor_int64": np.array([1, 2]).astype("int64"),
             },
-            fetch_list=[out_1, out_2, out_3, out_4, out_5, out_6, out_7])
+            fetch_list=[
+                out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
+            ])
 
         assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
@@ -289,6 +297,31 @@
         assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32"))
         assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32"))
+        assert np.array_equal(res_8, np.full([1, 2], 1.1, dtype="float32"))
+
+
+class TestFillConstantImperative(unittest.TestCase):
+    def test_api(self):
+        with fluid.dygraph.guard():
+            data1 = np.array([1, 2]).astype('int32')
+            data2 = np.array([1.1]).astype('float32')
+            shape = fluid.dygraph.to_variable(data1)
+            val = fluid.dygraph.to_variable(data2)
+            res1 = fluid.layers.fill_constant(
+                shape=[1, 2], dtype='float32', value=1.1)
+            res2 = fluid.layers.fill_constant(
+                shape=shape, dtype='float32', value=1.1)
+            res3 = fluid.layers.fill_constant(
+                shape=shape, dtype='float32', value=val)
+            assert np.array_equal(
+                res1.numpy(), np.full(
+                    [1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(
+                res2.numpy(), np.full(
+                    [1, 2], 1.1, dtype="float32"))
+            assert np.array_equal(
+                res3.numpy(), np.full(
+                    [1, 2], 1.1, dtype="float32"))
 
 
 class TestFillConstantOpError(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
index 577f9f6504fd83377f481aeab63b1780d50f6abe..5e5c4e17f5b97b12b17c8145c449327bbdad1967 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_fleet_launch.sh
@@ -4,7 +4,6 @@ set -e
 
 function test_launch_ps(){
     fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
-
     if grep -q "server are killed" ut.elog; then
         echo "test pserver launch succeed"
     else
@@ -20,7 +19,7 @@ fi
 test_launch_ps
 
 # use default values
-fleetrun multi_process.py
+fleetrun multi_process.py fleetrun
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
 export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
 export PADDLE_TRAINER_ID=0
-export PADDLE_PORT=35019
+export PADDLE_PORT=35789
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
 
-str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
-str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
+str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
+file_0="multi_process_fleetrun.check_0.log"
+file_1="multi_process_fleetrun.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
     echo "train abort as planned"
 fi
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
index dd5cd715ecd1ed9ebc30a22cb924255d278643ed..a91f6cbd69e18e949b14787f46923c6df11e9b04 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_4.py
@@ -40,10 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
             from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
             from paddle.fluid.incubate.fleet.base.role_maker import \
                 GeneralRoleMaker
-            from paddle.fluid.incubate.fleet.utils.http_server import KVHandler
-            from paddle.fluid.incubate.fleet.utils.http_server import KVServer
-            from paddle.fluid.incubate.fleet.utils.http_server import \
-                KVHTTPServer
+            from paddle.fleet.utils import KVHandler
+            from paddle.fleet.utils import KVServer
+            from paddle.fleet.utils import KVHTTPServer
         except:
             print("warning: no fleet, skip test_pslib_4")
             return
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
new file mode 100644
index 0000000000000000000000000000000000000000..659cc34b5495894e883f10fb73a56719c9c58442
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Test cloud role maker."""
+
+from __future__ import print_function
+import os
+import unittest
+import paddle.fleet.base.role_maker as role_maker
+
+
+class TestRoleMakerBase(unittest.TestCase):
+    """
+    Test cases for RoleMakerBase
+    """
+
+    def test_rolemaker_base(self):
+        role = role_maker.RoleMakerBase()
+        self.assertRaises(Exception, role.is_worker)
+        self.assertRaises(Exception, role.is_server)
+        self.assertRaises(Exception, role.is_first_worker)
+        self.assertRaises(Exception, role.worker_num)
+        self.assertRaises(Exception, role.server_num)
+        self.assertRaises(Exception, role.worker_index)
+        self.assertRaises(Exception, role.server_index)
+        self.assertRaises(Exception, role.role_id)
+
+        trainer_endpoints = role.get_trainer_endpoints()
+        self.assertTrue(len(trainer_endpoints) == 0)
+        pserver_endpoints = role.get_pserver_endpoints()
+        self.assertTrue(len(pserver_endpoints) == 0)
+
+        print(role.to_string())
+        self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
+        self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
+        role._barrier(role._node_type_comm)
+
+
+class TestCloudRoleMaker(unittest.TestCase):
+    """
+    Test cases for PaddleCloudRoleMaker.
+ """ + + def setUp(self): + """Set up, set envs.""" + os.environ["PADDLE_TRAINERS_NUM"] = "2" + os.environ[ + "PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001" + os.environ["POD_IP"] = "127.0.0.1" + + def test_tr_rolemaker(self): + """Test tr rolenamer.""" + os.environ["TRAINING_ROLE"] = "TRAINER" + os.environ["PADDLE_TRAINER_ID"] = "0" + + try: + import netifaces + except: + print("warning: no netifaces, skip test_tr_rolemaker") + return + + ro = role_maker.PaddleCloudRoleMaker( + is_collective=False, init_gloo=False) + self.assertTrue(ro.is_worker()) + self.assertFalse(ro.is_server()) + self.assertEqual(ro.worker_num(), 2) + self.assertTrue(ro.is_first_worker()) + worker_endpoints = ro.get_trainer_endpoints() + self.assertEqual(worker_endpoints[0], '127.0.0.1:36001') + self.assertEqual(ro.role_id(), 0) + + def test_tr_rolemaker_collective(self): + ro = role_maker.PaddleCloudRoleMaker(is_collective=True) + self.assertEqual(ro.worker_num(), 2) + + def test_ps_rolemaker(self): + """Test ps rolemaker.""" + os.environ["TRAINING_ROLE"] = "PSERVER" + os.environ["POD_IP"] = "127.0.0.1" + os.environ["PADDLE_PORT"] = "36001" + + try: + import netifaces + except: + print("warning: no netifaces, skip test_ps_rolemaker") + return + + ro = role_maker.PaddleCloudRoleMaker( + is_collective=False, init_gloo=False) + self.assertEqual(ro.server_index(), 0) + self.assertFalse(ro.is_worker()) + self.assertTrue(ro.is_server()) + self.assertEqual(ro.server_num(), 2) + pserver_endpoints = ro.get_pserver_endpoints() + self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001') + self.assertTrue(ro._all_gather(ro._all_comm, 1) is None) + self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None) + + def test_traing_role(self): + """Test training role.""" + os.environ["TRAINING_ROLE"] = "TEST" + try: + import netifaces + except: + print("warning: no netifaces, skip test_training_role") + return + + ro = role_maker.PaddleCloudRoleMaker(is_collective=False) + self.assertRaises(ValueError, ro.generate_role) + + +class TestUserDefinedRoleMaker(unittest.TestCase): + """ + Test cases for UserDefinedRoleMaker. 
+ """ + + def setUp(self): + pass + + def test_ps_rolemaker(self): + try: + import netifaces + except: + print("warning: no netifaces, skip test_ps_rolemaker") + return + + ro = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + server_endpoints="127.0.0.1:36001,127.0.0.1:36001", + role=role_maker.Role.SERVER, + current_id=0, + worker_num=2) + self.assertEqual(ro.server_num(), 2) + ro.generate_role() + self.assertTrue(ro.is_server()) + self.assertEqual(ro.role_id(), 0) + + def test_tr_rolemaker(self): + try: + import netifaces + except: + print("warning: no netifaces, skip test_tr_rolemaker") + return + + ro = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + server_endpoints="127.0.0.1:36001,127.0.0.1:36001", + role=role_maker.Role.WORKER, + current_id=0, + worker_num=2) + self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints()) + self.assertTrue(ro.is_worker()) + self.assertEqual(ro.role_id(), 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py index 427e077416e979ad5a77f4744ba6ffdb5064fdff..e52cb5f920c2ebdf54c8b3e64cf61d16baaeadf4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_util.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py @@ -12,12 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -import unittest +from __future__ import print_function import paddle +import paddle.fluid as fluid +import unittest +import numpy as np +import tarfile +import tempfile import os +import sys +from paddle.dataset.common import download, DATA_HOME +from paddle.fleet.base.util_factory import fleet_util +import paddle.fleet.base.role_maker as role_maker class TestFleetUtil(unittest.TestCase): + proto_data_url = "https://fleet.bj.bcebos.com/fleet_util_data.tgz" + proto_data_md5 = "59b7f12fd9dc24b64ae8e4629523a92a" + module_name = "fleet_util_data" + pruned_dir = os.path.join("fleet_util_data", "pruned_model") + train_dir = os.path.join("fleet_util_data", "train_program") + def test_util_base(self): import paddle.fleet as fleet util = fleet.UtilBase() @@ -65,6 +80,262 @@ class TestFleetUtil(unittest.TestCase): user_id = fleet.util.get_user_id() self.assertEqual(user_id, 10) + def test_fs(self): + from paddle.fleet.utils import LocalFS + fs = LocalFS() + dirs, files = fs.ls_dir("test_tmp") + dirs, files = fs.ls_dir("./") + self.assertFalse(fs.need_upload_download()) + fleet_util.set_file_system(fs) + + def test_barrier(self): + try: + import netifaces + except: + print("warning: no netifaces, skip test_barrier") + return + + gloo = fluid.core.Gloo() + gloo.set_rank(0) + gloo.set_size(1) + gloo.set_prefix("123") + gloo.set_iface("lo") + gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "") + gloo.init() + + role = role_maker.UserDefinedRoleMaker( + is_collective=False, + init_gloo=False, + current_id=0, + role=role_maker.Role.SERVER, + worker_endpoints=["127.0.0.1:6003"], + server_endpoints=["127.0.0.1:6001"]) + role._node_type_comm = gloo + role._role_is_generated = True + fleet_util._set_role_maker(role) + + fleet_util.barrier("worker") + + def test_all_reduce(self): + try: + import netifaces + except: + print("warning: no netifaces, skip test_all_reduce") + return + + gloo = fluid.core.Gloo() + gloo.set_rank(0) + gloo.set_size(1) + gloo.set_prefix("123") + gloo.set_iface("lo") + gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "") + 
diff --git a/python/paddle/fluid/tests/unittests/test_fleet_util.py b/python/paddle/fluid/tests/unittests/test_fleet_util.py
index 427e077416e979ad5a77f4744ba6ffdb5064fdff..e52cb5f920c2ebdf54c8b3e64cf61d16baaeadf4 100644
--- a/python/paddle/fluid/tests/unittests/test_fleet_util.py
+++ b/python/paddle/fluid/tests/unittests/test_fleet_util.py
@@ -12,12 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import unittest
+from __future__ import print_function
 import paddle
+import paddle.fluid as fluid
+import unittest
+import numpy as np
+import tarfile
+import tempfile
 import os
+import sys
+from paddle.dataset.common import download, DATA_HOME
+from paddle.fleet.base.util_factory import fleet_util
+import paddle.fleet.base.role_maker as role_maker
 
 
 class TestFleetUtil(unittest.TestCase):
+    proto_data_url = "https://fleet.bj.bcebos.com/fleet_util_data.tgz"
+    proto_data_md5 = "59b7f12fd9dc24b64ae8e4629523a92a"
+    module_name = "fleet_util_data"
+    pruned_dir = os.path.join("fleet_util_data", "pruned_model")
+    train_dir = os.path.join("fleet_util_data", "train_program")
+
     def test_util_base(self):
         import paddle.fleet as fleet
         util = fleet.UtilBase()
@@ -65,6 +80,262 @@ class TestFleetUtil(unittest.TestCase):
         user_id = fleet.util.get_user_id()
         self.assertEqual(user_id, 10)
 
+    def test_fs(self):
+        from paddle.fleet.utils import LocalFS
+        fs = LocalFS()
+        dirs, files = fs.ls_dir("test_tmp")
+        dirs, files = fs.ls_dir("./")
+        self.assertFalse(fs.need_upload_download())
+        fleet_util.set_file_system(fs)
+
+    def test_barrier(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_barrier")
+            return
+
+        gloo = fluid.core.Gloo()
+        gloo.set_rank(0)
+        gloo.set_size(1)
+        gloo.set_prefix("123")
+        gloo.set_iface("lo")
+        gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
+        gloo.init()
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.SERVER,
+            worker_endpoints=["127.0.0.1:6003"],
+            server_endpoints=["127.0.0.1:6001"])
+        role._node_type_comm = gloo
+        role._role_is_generated = True
+        fleet_util._set_role_maker(role)
+
+        fleet_util.barrier("worker")
+
+    def test_all_reduce(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_all_reduce")
+            return
+
+        gloo = fluid.core.Gloo()
+        gloo.set_rank(0)
+        gloo.set_size(1)
+        gloo.set_prefix("123")
+        gloo.set_iface("lo")
+        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
+        gloo.init()
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.WORKER,
+            worker_endpoints=["127.0.0.1:6003"],
+            server_endpoints=["127.0.0.1:6001"])
+        role._node_type_comm = gloo
+        role._role_is_generated = True
+        fleet_util._set_role_maker(role)
+
+        output = fleet_util.all_reduce(1, "sum", comm_world="server")
+        print(output)
+        # self.assertEqual(output, 1)
+
+    def test_all_gather(self):
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_all_gather")
+            return
+
+        gloo = fluid.core.Gloo()
+        gloo.set_rank(0)
+        gloo.set_size(1)
+        gloo.set_prefix("123")
+        gloo.set_iface("lo")
+        gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
+        gloo.init()
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.SERVER,
+            worker_endpoints=["127.0.0.1:6003"],
+            server_endpoints=["127.0.0.1:6001"])
+        role._node_type_comm = gloo
+        role._all_comm = gloo
+        role._role_is_generated = True
+        fleet_util._set_role_maker(role)
+
+        output = fleet_util.all_gather(1, comm_world="all")
+        print(output)
+        # self.assertTrue(len(output) == 1 and output[0] == 1)
+        self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
+
+    def download_files(self):
+        path = download(self.proto_data_url, self.module_name,
+                        self.proto_data_md5)
+        print('data is downloaded at ' + path)
+        tar = tarfile.open(path)
+        unzip_folder = tempfile.mkdtemp()
+        tar.extractall(unzip_folder)
+        return unzip_folder
+
+    def test_get_file_shard(self):
+        self.assertRaises(Exception, fleet_util.get_file_shard, "files")
+        try:
+            import netifaces
+        except:
+            print("warning: no netifaces, skip test_get_file_shard")
+            return
+
+        role = role_maker.UserDefinedRoleMaker(
+            is_collective=False,
+            init_gloo=False,
+            current_id=0,
+            role=role_maker.Role.WORKER,
+            worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
+            server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
+        fleet_util._set_role_maker(role)
+        files = fleet_util.get_file_shard(["1", "2", "3"])
+        self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
+
+    def test_program_type_trans(self):
+        data_dir = self.download_files()
+        program_dir = os.path.join(data_dir, self.pruned_dir)
+        text_program = "pruned_main_program.pbtxt"
+        binary_program = "pruned_main_program.bin"
+        text_to_binary = fleet_util._program_type_trans(program_dir,
+                                                        text_program, True)
+        binary_to_text = fleet_util._program_type_trans(program_dir,
+                                                        binary_program, False)
+        self.assertTrue(
+            os.path.exists(os.path.join(program_dir, text_to_binary)))
+        self.assertTrue(
+            os.path.exists(os.path.join(program_dir, binary_to_text)))
+
+    def test_params_check(self):
+        data_dir = self.download_files()
+
+        class config:
+            pass
+
+        feed_config = config()
+        feed_config.feeded_vars_names = ['concat_1.tmp_0', 'concat_2.tmp_0']
+        feed_config.feeded_vars_dims = [682, 1199]
+        feed_config.feeded_vars_types = [np.float32, np.float32]
+        feed_config.feeded_vars_filelist = [
+            os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_1")),
+            os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_2"))
+        ]
+
+        fetch_config = config()
+        fetch_config.fetch_vars_names = ['similarity_norm.tmp_0']
+
+        conf = config()
+        conf.batch_size = 1
+        conf.feed_config = feed_config
+        conf.fetch_config = fetch_config
+        conf.dump_model_dir = os.path.join(data_dir, self.pruned_dir)
+        conf.dump_program_filename = "pruned_main_program.pbtxt"
+        conf.is_text_dump_program = True
+        conf.save_params_filename = None
+
+        # test saved var's shape
+        conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
+
+        self.assertRaises(Exception, fleet_util._params_check)
+
+        # test program.proto without feed_op and fetch_op
+        conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
+        results = fleet_util._params_check(conf)
+        self.assertTrue(len(results) == 1)
+        np.testing.assert_array_almost_equal(
+            results[0], np.array(
+                [[3.0590223e-07]], dtype=np.float32))
+
+        # test feed_var's shape
+        conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
+        self.assertRaises(Exception, fleet_util._params_check)
+
+        # test correct case with feed_vars_filelist
+        conf.dump_program_filename = "pruned_main_program.pbtxt"
+        results = fleet_util._params_check(conf)
+        self.assertTrue(len(results) == 1)
+        np.testing.assert_array_almost_equal(
+            results[0], np.array(
+                [[3.0590223e-07]], dtype=np.float32))
+
+        # test correct case without feed_vars_filelist
+        conf.feed_config.feeded_vars_filelist = None
+        # test feed var with lod_level >= 2
+        conf.dump_program_filename = "pruned_main_program.feed_lod2"
+        self.assertRaises(Exception, fleet_util._params_check)
+
+        conf.dump_program_filename = "pruned_main_program.pbtxt"
+        results = fleet_util._params_check(conf)
+        self.assertTrue(len(results) == 1)
+
+    def test_proto_check(self):
+        data_dir = self.download_files()
+
+        class config:
+            pass
+
+        conf = config()
+        conf.train_prog_path = os.path.join(
+            data_dir, os.path.join(self.train_dir, "join_main_program.pbtxt"))
+        conf.is_text_train_program = True
+
+        # test not match
+        conf.pruned_prog_path = os.path.join(
+            data_dir,
+            os.path.join(self.pruned_dir,
+                         "pruned_main_program.save_var_shape_not_match"))
+        conf.is_text_pruned_program = True
+        conf.draw = False
+        res = fleet_util._proto_check(conf)
+        self.assertFalse(res)
+
+        # test match
+        conf.pruned_prog_path = os.path.join(
+            data_dir,
+            os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
+        if sys.platform == 'win32':
+            conf.draw = False
+        else:
+            conf.draw = True
+            conf.draw_out_name = "pruned_check"
+        res = fleet_util._proto_check(conf)
+        self.assertTrue(res)
+
+    def test_visualize(self):
+        if sys.platform == 'win32':
+            pass
+        else:
+            data_dir = self.download_files()
+            program_path = os.path.join(
+                data_dir,
+                os.path.join(self.train_dir, "join_main_program.pbtxt"))
+            is_text = True
+            program = fleet_util._load_program(program_path, is_text)
+            output_dir = os.path.join(data_dir, self.train_dir)
+            output_filename = "draw_prog"
+            fleet_util._visualize_graphviz(program, output_dir, output_filename)
+            self.assertTrue(
+                os.path.exists(
+                    os.path.join(output_dir, output_filename + ".dot")))
+            self.assertTrue(
+                os.path.exists(
+                    os.path.join(output_dir, output_filename + ".pdf")))
+
 
 if __name__ == "__main__":
     unittest.main()
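
Note: the get_file_shard assertion above pins down the split only for trainer 0 of 2 (two of the three files). One plausible reading of the rule — an even block split with the remainder going to the lowest-ranked workers — is sketched below; this is an illustration consistent with the test, not the library's implementation:

    def shard_files(files, trainer_id, trainer_num):
        # e.g. 3 files over 2 trainers -> trainer 0: ["1", "2"], trainer 1: ["3"]
        blocks = len(files) // trainer_num
        remainder = len(files) % trainer_num
        start = trainer_id * blocks + min(trainer_id, remainder)
        end = start + blocks + (1 if trainer_id < remainder else 0)
        return files[start:end]

    assert shard_files(["1", "2", "3"], 0, 2) == ["1", "2"]
    assert shard_files(["1", "2", "3"], 1, 2) == ["3"]
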
diff --git a/python/paddle/fluid/tests/unittests/test_fs_interface.py b/python/paddle/fluid/tests/unittests/test_fs_interface.py
index 0d87b94538f05d734cb3e621fc0dfc7c48e8fea2..7f780bd44f7e2def44e8fdff7fa22d32633372ef 100644
--- a/python/paddle/fluid/tests/unittests/test_fs_interface.py
+++ b/python/paddle/fluid/tests/unittests/test_fs_interface.py
@@ -20,9 +20,7 @@ import os
 import sys
 import inspect
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
-from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 
 class FSTest(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/unittests/test_hdfs.py b/python/paddle/fluid/tests/unittests/test_hdfs.py
index 9826542cee3732a48e1c6b6959afb74063bb09d7..80c7fd4ad57d157146a1e78e1f057a3f389b5923 100644
--- a/python/paddle/fluid/tests/unittests/test_hdfs.py
+++ b/python/paddle/fluid/tests/unittests/test_hdfs.py
@@ -19,9 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, T
 import os
 import sys
 
-from paddle.fluid.incubate.fleet.utils.fs import LocalFS
-from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
-from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
+from paddle.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
 
 java_home = os.environ["JAVA_HOME"]
diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
index abc46034957cf7414310f0f593f3bcce71a6d1de..a61d31e88253d7b45efde6226fe14cf5b5b11af9 100644
--- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py
+++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py
@@ -14,13 +14,15 @@
 
 from __future__ import print_function
 
+import os
 import unittest
 import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import Linear
-from paddle.fluid.dygraph import declarative
+from paddle.fluid.dygraph import declarative, ProgramTranslator
+from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
 
 BATCH_SIZE = 32
 BATCH_NUM = 20
@@ -77,8 +79,8 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
 
 def train(layer):
     # create optimizer
-    adam = fluid.optimizer.AdamOptimizer(
-        learning_rate=0.1, parameter_list=layer.parameters())
+    adam = fluid.optimizer.SGDOptimizer(
+        learning_rate=0.01, parameter_list=layer.parameters())
     # create data loader
     train_loader = fluid.io.DataLoader.from_generator(capacity=5)
     train_loader.set_batch_generator(random_batch_reader())
@@ -111,37 +113,43 @@ class TestJitSaveLoad(unittest.TestCase):
         # config seed
         fluid.default_main_program().random_seed = SEED
 
-    def train_and_save_model(self):
+    def train_and_save_model(self, model_path=None, configs=None):
         layer = LinearNet(784, 1)
         example_inputs, layer, _ = train(layer)
+        final_model_path = model_path if model_path else self.model_path
         orig_input_types = [type(x) for x in example_inputs]
         fluid.dygraph.jit.save(
-            layer=layer, model_path=self.model_path, input_spec=example_inputs)
+            layer=layer,
+            model_path=final_model_path,
+            input_spec=example_inputs,
+            configs=configs)
         new_input_types = [type(x) for x in example_inputs]
         self.assertEqual(orig_input_types, new_input_types)
         return layer
 
-    def test_save(self):
-        # train and save model
-        self.train_and_save_model()
-
-    def test_load_infernece(self):
+    def test_save_load(self):
         # train and save model
         train_layer = self.train_and_save_model()
         # load model
-        infer_layer = fluid.dygraph.jit.load(self.model_path)
+        program_translator = ProgramTranslator()
+        program_translator.enable(False)
+        loaded_layer = fluid.dygraph.jit.load(self.model_path)
+        self.load_and_inference(train_layer, loaded_layer)
+        self.load_dygraph_state_dict(train_layer)
+        self.load_and_finetune(train_layer, loaded_layer)
+        program_translator.enable(True)
+
+    def load_and_inference(self, train_layer, infer_layer):
         train_layer.eval()
+        infer_layer.eval()
         # inference & compare
         x = fluid.dygraph.to_variable(
             np.random.random((1, 784)).astype('float32'))
         self.assertTrue(
             np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
 
-    def test_load_finetune(self):
-        # train and save model
-        train_layer = self.train_and_save_model()
-        # load model
-        load_train_layer = fluid.dygraph.jit.load(self.model_path)
+    def load_and_finetune(self, train_layer, load_train_layer):
+        train_layer.train()
         load_train_layer.train()
         # train & compare
         _, _, train_loss = train(train_layer)
@@ -149,6 +157,19 @@
         self.assertTrue(
             np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
 
+    def load_dygraph_state_dict(self, train_layer):
+        train_layer.eval()
+        # construct new model
+        new_layer = LinearNet(784, 1)
+        model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
+        new_layer.set_dict(model_dict)
+        new_layer.eval()
+        # inference & compare
+        x = fluid.dygraph.to_variable(
+            np.random.random((1, 784)).astype('float32'))
+        self.assertTrue(
+            np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
+
     def test_save_get_program_failed(self):
         layer = LinearNetNotDeclarative(784, 1)
         example_inputs, layer, _ = train(layer)
@@ -158,6 +179,31 @@
                 model_path=self.model_path,
                 input_spec=example_inputs)
 
+    def test_load_dygraph_no_path(self):
+        model_path = "model.test_jit_save_load.no_path"
+        new_layer = LinearNet(784, 1)
+        with self.assertRaises(ValueError):
+            model_dict, _ = fluid.dygraph.load_dygraph(model_path)
+
+    def test_load_dygraph_no_var_info(self):
+        model_path = "model.test_jit_save_load.no_var_info"
+        self.train_and_save_model(model_path=model_path)
+        # remove `__variables.info__`
+        var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
+        os.remove(var_info_path)
+        new_layer = LinearNet(784, 1)
+        with self.assertRaises(RuntimeError):
+            model_dict, _ = fluid.dygraph.load_dygraph(model_path)
+
+    def test_load_dygraph_not_var_file(self):
+        model_path = "model.test_jit_save_load.no_var_file"
+        configs = fluid.dygraph.jit.SaveLoadConfig()
+        configs.params_filename = "__params__"
+        self.train_and_save_model(model_path=model_path, configs=configs)
+        new_layer = LinearNet(784, 1)
+        with self.assertRaises(RuntimeError):
+            model_dict, _ = fluid.dygraph.load_dygraph(model_path)
+
 
 class TestJitSaveLoadConfig(unittest.TestCase):
     def setUp(self):
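
Note: the load paths exercised above (jit.load, load_dygraph into a fresh layer, finetune) all consume one saved artifact. A condensed round trip using the same APIs, assuming the LinearNet and train helpers defined earlier in this file and that dygraph mode is active as in these tests (the model path is hypothetical):

    import numpy as np
    import paddle.fluid as fluid

    fluid.enable_dygraph()  # assumption: the tests run with dygraph enabled
    layer = LinearNet(784, 1)
    example_inputs, layer, _ = train(layer)
    fluid.dygraph.jit.save(
        layer=layer, model_path="model.demo", input_spec=example_inputs)

    # rebuild from the state dict alone
    new_layer = LinearNet(784, 1)
    model_dict, _ = fluid.dygraph.load_dygraph("model.demo")
    new_layer.set_dict(model_dict)
    new_layer.eval()
    layer.eval()
    x = fluid.dygraph.to_variable(np.random.random((1, 784)).astype('float32'))
    assert np.array_equal(layer(x).numpy(), new_layer(x).numpy())
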
diff --git a/python/paddle/fluid/tests/unittests/test_launch.sh b/python/paddle/fluid/tests/unittests/test_launch.sh
index f1bf6395f15ce0d5ce49eff241a752e4847d9d17..98c907a551965331f79d1635362213b43d867002 100644
--- a/python/paddle/fluid/tests/unittests/test_launch.sh
+++ b/python/paddle/fluid/tests/unittests/test_launch.sh
@@ -3,7 +3,7 @@ set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
 launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-python ${launch_py} multi_process.py
+python ${launch_py} multi_process.py launch
 
 # use paddlecloud
 echo "begin test use paddlecloud"
@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
 export TRAINER_PORTS_NUM=2
 
 distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
+CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
 
 str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
 str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
-file_0="multi_process.check_0.log"
-file_1="multi_process.check_1.log"
+file_0="multi_process_launch.check_0.log"
+file_1="multi_process_launch.check_1.log"
 
 echo "paddlecloud params test"
 if grep -q "$str1" "$file_0"; then
@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
 
 echo ""
 echo "paddle.distributed.launch async poll process test"
-if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
+if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
     echo "train abort as planned"
 fi
diff --git a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
index 0f14c9d1c3ba99b5c9b1500e4b7ddabb690e9290..aed265b21b5781d88da0380b04872061e893d736 100644
--- a/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_tril_triu_op.py
@@ -63,7 +63,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
         "diagonal: TypeError":
         "diagonal in {} must be a python Int".format(op_type),
         "input: ValueError":
-        "input shape in {} must be at least 2-D".format(op_type),
+        "x shape in {} must be at least 2-D".format(op_type),
     }
 
     class FailureCase(unittest.TestCase):
@@ -71,7 +71,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
             data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
             with self.assertRaisesRegexp(
                     eval(expected.split(':')[-1]), errmsg[expected]):
-                getattr(tensor, op_type)(input=data, diagonal=diagonal)
+                getattr(tensor, op_type)(x=data, diagonal=diagonal)
 
     class SuccessCase(TrilTriuOpDefaultTest):
         def initTestCase(self):
diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
index 1b353e1379076cc71e8013487e0b22f5bf03dc09..9382d53e7fec6ba9e1217f99ba5006b3dfe5c150 100644
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -81,7 +81,7 @@ class API_TestUnsqueeze(unittest.TestCase):
     def test_out(self):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.layers.data('data1', shape=[-1, 10], dtype='float64')
-            result_squeeze = paddle.unsqueeze(data1, axes=[1])
+            result_squeeze = paddle.unsqueeze(data1, axis=[1])
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([5, 1, 10]).astype('float64')
@@ -98,7 +98,7 @@ class TestUnsqueezeOpError(unittest.TestCase):
         def test_axes_type():
             x6 = fluid.layers.data(
                 shape=[-1, 10], dtype='float16', name='x3')
-            paddle.unsqueeze(x6, axes=3.2)
+            paddle.unsqueeze(x6, axis=3.2)
 
         self.assertRaises(TypeError, test_axes_type)
 
@@ -108,7 +108,7 @@ class API_TestUnsqueeze2(unittest.TestCase):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
             data2 = fluid.data('data2', shape=[1], dtype='int32')
-            result_squeeze = paddle.unsqueeze(data1, axes=data2)
+            result_squeeze = paddle.unsqueeze(data1, axis=data2)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([5, 1, 10]).astype('float64')
@@ -125,7 +125,7 @@ class API_TestUnsqueeze3(unittest.TestCase):
         with fluid.program_guard(fluid.Program(), fluid.Program()):
             data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
             data2 = fluid.data('data2', shape=[1], dtype='int32')
-            result_squeeze = paddle.unsqueeze(data1, axes=[data2, 3])
+            result_squeeze = paddle.unsqueeze(data1, axis=[data2, 3])
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
             input1 = np.random.random([5, 1, 10, 1]).astype('float64')
@@ -143,7 +143,7 @@ class API_TestDyUnsqueeze(unittest.TestCase):
         input_1 = np.random.random([5, 1, 10]).astype("int32")
         input1 = np.squeeze(input_1, axis=1)
         input = fluid.dygraph.to_variable(input_1)
-        output = paddle.unsqueeze(input, axes=[1])
+        output = paddle.unsqueeze(input, axis=[1])
         out_np = output.numpy()
         self.assertTrue(np.allclose(input1, out_np))
@@ -154,7 +154,7 @@ class API_TestDyUnsqueeze2(unittest.TestCase):
         input_1 = np.random.random([5, 1, 10]).astype("int32")
         input1 = np.squeeze(input_1, axis=1)
         input = fluid.dygraph.to_variable(input_1)
-        output = paddle.unsqueeze(input, axes=1)
+        output = paddle.unsqueeze(input, axis=1)
         out_np = output.numpy()
         self.assertTrue(np.allclose(input1, out_np))
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index 10f93f90fbb875f3fd546fb8b561ec0d1933294c..02c908be347ab00ce29babd01f227e8367e259f2 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -248,7 +248,7 @@ def zeros(shape, dtype=None, name=None):
 
           # shape is a Tensor
           shape = paddle.fill_constant(shape=[2], dtype='int32', value=2)
-          data3 = paddle.ones(shape=shape, dtype='int32')
+          data3 = paddle.zeros(shape=shape, dtype='int32')
           # [[0 0]
           #  [0 0]]
     """
@@ -490,14 +490,13 @@ def _tril_triu_op(helper):
     """Base op of tril_op and triu_op
     """
     op_type = helper.layer_type
-    x = helper.kwargs.get('input', None)
+    x = helper.kwargs.get('x', None)
     assert x is not None, 'x cannot be None in {}'.format(op_type)
     check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
                              op_type)
     if len(x.shape) < 2:
-        raise ValueError("input shape in {} must be at least 2-D".format(
-            op_type))
+        raise ValueError("x shape in {} must be at least 2-D".format(op_type))
     diagonal = helper.kwargs.get('diagonal', 0)
     if not isinstance(diagonal, (int, )):
         raise TypeError("diagonal in {} must be a python Int".format(op_type))
@@ -521,18 +520,18 @@ def _tril_triu_op(helper):
     return out
 
 
-def tril(input, diagonal=0, name=None):
+def tril(x, diagonal=0, name=None):
     """
    :alias_main: paddle.tril
    :alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
 
    This op returns the lower triangular part of a matrix (2-D tensor) or batch
-    of matrices :attr:`input`, the other elements of the result tensor are set
+    of matrices :attr:`x`, the other elements of the result tensor are set
    to 0. The lower triangular part of the matrix is defined as the elements
    on and below the diagonal.
 
    Args:
-        input (Variable): The input variable which is a Tensor.
+        x (Variable): The input variable x which is a Tensor.
            Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
        diagonal (int, optional): The diagonal to consider, default value is 0.
            If :attr:`diagonal` = 0, all elements on and below the main diagonal are
@@ -545,47 +544,41 @@
            user to set this property. For more information, please refer to
            :ref:`api_guide_Name`.
 
    Returns:
-        Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor,
-        it's data type is the same as input's Tensor.
+        Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor x,
+        it's data type is the same as x's Tensor.
 
    Raises:
        TypeError: diagonal is not a int type.
-        ValueError: dimension of :attr:`input` is less than 2.
+        ValueError: dimension of :attr:`x` is less than 2.
 
    Examples:
        .. code-block:: python
 
            import numpy as np
-            import paddle.tensor as tensor
-            import paddle.fluid as fluid
+            import paddle
 
            data = np.arange(1, 13, dtype="int64").reshape(3,-1)
            # array([[ 1,  2,  3,  4],
            #        [ 5,  6,  7,  8],
            #        [ 9, 10, 11, 12]])
-            x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
-            exe = fluid.Executor(fluid.CPUPlace())
-            # example 1, default diagonal
-            tril = tensor.tril(x)
-            tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[tril], return_numpy=True)
+            paddle.enable_imperative()
+
+            x = paddle.imperative.to_variable(data)
+
+            tril1 = paddle.tensor.tril(x)
            # array([[ 1,  0,  0,  0],
            #        [ 5,  6,  0,  0],
            #        [ 9, 10, 11,  0]])
 
            # example 2, positive diagonal value
-            tril = tensor.tril(x, diagonal=2)
-            tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[tril], return_numpy=True)
+            tril2 = paddle.tensor.tril(x, diagonal=2)
            # array([[ 1,  2,  3,  0],
            #        [ 5,  6,  7,  8],
            #        [ 9, 10, 11, 12]])
 
            # example 3, negative diagonal value
-            tril = tensor.tril(x, diagonal=-1)
-            tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[tril], return_numpy=True)
+            tril3 = paddle.tensor.tril(x, diagonal=-1)
            # array([[ 0,  0,  0,  0],
            #        [ 5,  0,  0,  0],
            #        [ 9, 10,  0,  0]])
 
@@ -593,23 +586,23 @@
    """
    if in_dygraph_mode():
        op = getattr(core.ops, 'tril_triu')
-        return op(input, 'diagonal', diagonal, "lower", True)
+        return op(x, 'diagonal', diagonal, "lower", True)
 
    return _tril_triu_op(LayerHelper('tril', **locals()))
 
 
-def triu(input, diagonal=0, name=None):
+def triu(x, diagonal=0, name=None):
    """
    :alias_main: paddle.triu
    :alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
 
    This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices
-    :attr:`input`, the other elements of the result tensor are set to 0.
+    :attr:`x`, the other elements of the result tensor are set to 0.
    The upper triangular part of the matrix is defined as the elements on and
    above the diagonal.
 
    Args:
-        input (Variable): The input variable which is a Tensor.
+        x (Variable): The input variable x which is a Tensor.
            Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
        diagonal (int, optional): The diagonal to consider, default value is 0.
            If :attr:`diagonal` = 0, all elements on and above the main diagonal are
@@ -622,47 +615,41 @@
            user to set this property. For more information, please refer to
            :ref:`api_guide_Name`.
 
    Returns:
-        Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor,
-        it's data type is the same as input's Tensor.
+        Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor x,
+        it's data type is the same as x's Tensor.
 
    Raises:
        TypeError: diagonal is not a int type.
-        ValueError: dimension of :attr:`input` is less than 2.
+        ValueError: dimension of :attr:`x` is less than 2.
 
    Examples:
        .. code-block:: python
 
            import numpy as np
-            import paddle.fluid as fluid
-            import paddle.tensor as tensor
+            import paddle
 
            data = np.arange(1, 13, dtype="int64").reshape(3,-1)
            # array([[ 1,  2,  3,  4],
            #        [ 5,  6,  7,  8],
            #        [ 9, 10, 11, 12]])
-            x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
-            exe = fluid.Executor(fluid.CPUPlace())
+
+            paddle.enable_imperative()
 
            # example 1, default diagonal
-            triu = tensor.triu(x)
-            triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[triu], return_numpy=True)
+            x = paddle.imperative.to_variable(data)
+            triu1 = paddle.tensor.triu(x)
            # array([[ 1,  2,  3,  4],
            #        [ 0,  6,  7,  8],
            #        [ 0,  0, 11, 12]])
 
            # example 2, positive diagonal value
-            triu = tensor.triu(x, diagonal=2)
-            triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[triu], return_numpy=True)
+            triu2 = paddle.tensor.triu(x, diagonal=2)
            # array([[0, 0, 3, 4],
            #        [0, 0, 0, 8],
            #        [0, 0, 0, 0]])
 
            # example 3, negative diagonal value
-            triu = tensor.triu(x, diagonal=-1)
-            triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
-                fetch_list=[triu], return_numpy=True)
+            triu3 = paddle.tensor.triu(x, diagonal=-1)
            # array([[ 1,  2,  3,  4],
            #        [ 5,  6,  7,  8],
            #        [ 0, 10, 11, 12]])
 
@@ -670,7 +657,7 @@
    """
    if in_dygraph_mode():
        op = getattr(core.ops, 'tril_triu')
-        return op(input, 'diagonal', diagonal, "lower", False)
+        return op(x, 'diagonal', diagonal, "lower", False)
 
    return _tril_triu_op(LayerHelper('triu', **locals()))
 
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index 6b67394b6bd250282e2ea8f13134503ac6cbfc0a..fcff5585bc12a75f274bd29236648d5b201a2f2d 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -729,26 +729,32 @@ def bmm(x, y, name=None):
    Examples:
        import paddle
-        import paddle.fluid as fluid
-        x = fluid.layers.data(name='x', shape=[10, 3, 4], dtype='float32')
-        y = fluid.layers.data(name='y', shape=[10, 4, 5], dtype='float32')
-        out = paddle.bmm(x, y)
-
-        # In dygraph mode:
+
+        # In imperative mode:
        # size input1: (2, 2, 3) and input2: (2, 3, 2)
        input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]])
        input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
-        with fluid.dygraph.guard():
-            x = fluid.dygraph.to_variable(input1)
-            y = fluid.dygraph.to_variable(input2)
-            out = paddle.bmm(x, y)
-            #output size: (2, 2, 2)
-            #output value:
-            #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
-            out_np = out.numpy()
+        paddle.enable_imperative()
+
+        x = paddle.imperative.to_variable(input1)
+        y = paddle.imperative.to_variable(input2)
+        out = paddle.bmm(x, y)
+        #output size: (2, 2, 2)
+        #output value:
+        #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
+        out_np = out.numpy()
    """
-
+    x_shape = x.shape
+    y_shape = y.shape
+    if not len(x_shape) == len(y_shape) == 3:
+        raise ValueError(
+            "x and y should be 3-dimensional. But received x's dimension: {}, y's dimension: {}".
+            format(x_shape, y_shape))
+    if x_shape[2] != y_shape[1]:
+        raise ValueError(
+            "x's width must be equal to y's height. But received x's shape: {}, y's shape: {}".
+            format(x_shape, y_shape))
     helper = LayerHelper('bmm', **locals())
     if in_dygraph_mode():
         return core.ops.bmm(x, y)
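
Note: with these eager checks a malformed batched matmul fails at the Python level instead of inside the kernel. A minimal repro in imperative mode, with shapes chosen to trip the width/height check (values arbitrary):

    import numpy as np
    import paddle

    paddle.enable_imperative()
    x = paddle.imperative.to_variable(np.ones((2, 2, 3), dtype='float32'))
    y = paddle.imperative.to_variable(np.ones((2, 4, 2), dtype='float32'))
    try:
        paddle.bmm(x, y)  # x's width is 3, y's height is 4 -> rejected
    except ValueError as e:
        print(e)
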
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index 07d327a21ede6c40578ab91fd28a5f011f95aba3..5b7c8c37b1b0a549f8c15af3e2d6425d5361de03 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -42,11 +42,32 @@ from ..fluid import layers
 import paddle
 
 __all__ = [
-    'cast', 'concat', 'expand', 'expand_as', 'flatten', 'gather', 'gather_nd',
-    'reshape', 'reverse', 'scatter', 'scatter_nd_add', 'scatter_nd',
-    'shard_index', 'slice', 'split', 'squeeze', 'stack', 'strided_slice',
-    'transpose', 'unique', 'unique_with_counts', 'unsqueeze', 'unstack', 'flip',
-    'unbind', 'roll'
+    'cast',
+    'concat',
+    'expand',
+    'expand_as',
+    'flatten',
+    'gather',
+    'gather_nd',
+    'reshape',
+    'reverse',
+    'scatter',
+    'scatter_nd_add',
+    'scatter_nd',
+    'shard_index',
+    'slice',
+    'split',
+    'squeeze',
+    'stack',
+    'strided_slice',
+    'transpose',
+    'unique',
+    'unique_with_counts',
+    'unsqueeze',
+    'unstack',
+    'flip',
+    'unbind',
+    'roll',
 ]
 
@@ -417,7 +438,7 @@ def stack(x, axis=0, name=None):
     Args:
         x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
                                  If ``x`` is a ``list``, the Tensors in ``x``
-                                 must be of the same shape and dtype. Support data types: float32, float64, int32, int64.
+                                 must be of the same shape and dtype. Supported data types: float32, float64, int32, int64.
         axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
                               where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
                               If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
@@ -559,18 +580,19 @@ def squeeze(x, axis=None, name=None):
             out.shape = [1, 3, 5]
 
     Args:
-        input (Tensor): The input Tensor. Support data type: float32, float64, int8, int32, int64.
+        x (Tensor): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
         axis (int|list|tuple, optional): An integer or list of integers, indicating the dimensions to be squeezed. Default is None.
-                          The range of axis is :math:`[-ndim(input), ndim(input))`.
-                          If axis is negative, :math:`axis = axis + ndim(input)`.
-                          If axis is None, all the dimensions of input of size 1 will be removed.
+                          The range of axis is :math:`[-ndim(x), ndim(x))`.
+                          If axis is negative, :math:`axis = axis + ndim(x)`.
+                          If axis is None, all the dimensions of x of size 1 will be removed.
         name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
 
     Returns:
-        Tensor: Output squeezed Tensor. Data type is same as input Tensor.
+        Tensor: Squeezed Tensor with the same data type as input Tensor.
 
     Examples:
         .. code-block:: python
+
             import paddle
 
             paddle.enable_imperative()
@@ -590,87 +612,50 @@
     return layers.squeeze(x, axis, name)
 
 
-def unsqueeze(input, axes, out=None, name=None):
+def unsqueeze(x, axis, name=None):
     """
     :alias_main: paddle.unsqueeze
-    :alias: paddle.unsqueeze,paddle.tensor.unsqueeze,paddle.tensor.manipulation.unsqueeze
-
-    Insert single-dimensional entries to the shape of a Tensor. Takes one
-    required argument axes, a list of dimensions that will be inserted.
-    Dimension indices in axes are as seen in the output tensor.
-
-    For example:
-
-    .. code-block:: text
+    :alias: paddle.unsqueeze, paddle.tensor.unsqueeze, paddle.tensor.manipulation.unsqueeze
 
-        Given a tensor such that tensor with shape [3, 4, 5],
-        then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
+    Insert single-dimensional entries to the shape of input Tensor ``x``. Takes one
+    required argument axis, a dimension or list of dimensions that will be inserted.
+    Dimension indices in axis are as seen in the output tensor.
 
     Args:
-        input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32.
-        axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
-        name (str|None): Name for this layer.
+        x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
+        axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` .
+                                      If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
+                                      If ``axis`` is a Tensor, it should be an 1-D Tensor .
+                                      If ``axis`` is negative, ``axis = axis + ndim(x) + 1``.
+        name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None.
 
     Returns:
-        Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64.
+        Tensor: Unsqueezed Tensor with the same data type as input Tensor.
 
     Examples:
         .. code-block:: python
 
-            import numpy as np
+            import paddle
-            import paddle.fluid as fluid
-            with fluid.dygraph.guard():
-                input_1 = np.random.random([5, 10]).astype("int32")
-                # input is a variable which shape is [5, 10]
-                input = fluid.dygraph.to_variable(input_1)
+            paddle.enable_imperative()
+            x = paddle.rand([5, 10])
+            print(x.shape)  # [5, 10]
+
+            out1 = paddle.unsqueeze(x, axis=0)
+            print(out1.shape)  # [1, 5, 10]
+
+            out2 = paddle.unsqueeze(x, axis=[0, 2])
+            print(out2.shape)  # [1, 5, 1, 10]
 
-                output = paddle.unsqueeze(input, axes=[1])
-                # output.shape [5, 1, 10]
+            axis = paddle.fluid.dygraph.to_variable([0, 1, 2])
+            out3 = paddle.unsqueeze(x, axis=axis)
+            print(out3.shape)  # [1, 1, 1, 5, 10]
+
     """
-    if not isinstance(axes, (int, list, tuple, Variable)):
-        raise TypeError(
-            "The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but "
-            "received %s." % (type(axes)))
-    helper = LayerHelper("unsqueeze2", **locals())
-    inputs = {"X": input}
-    attrs = {}
-
-    def _to_Variable_list(one_list):
-        Variable_list = []
-        for ele in one_list:
-            if isinstance(ele, Variable):
-                ele.stop_gradient = True
-                Variable_list.append(ele)
-            else:
-                assert (isinstance(ele, int))
-                temp_out = helper.create_variable_for_type_inference('int32')
-                fill_constant([1], 'int32', ele, force_cpu=True, out=temp_out)
-                Variable_list.append(temp_out)
-        return Variable_list
-
-    if isinstance(axes, int):
-        axes = [axes]
-    if isinstance(axes, Variable):
-        axes.stop_gradient = True
-        inputs["AxesTensor"] = axes
-    elif isinstance(axes, (list, tuple)):
-        contain_var = not all(not isinstance(ele, Variable) for ele in axes)
-        if contain_var:
-            inputs["AxesTensorList"] = _to_Variable_list(axes)
-        else:
-            attrs["axes"] = axes
-
-    out = helper.create_variable_for_type_inference(dtype=input.dtype)
-    x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
-    helper.append_op(
-        type="unsqueeze2",
-        inputs=inputs,
-        attrs=attrs,
-        outputs={"Out": out,
-                 "XShape": x_shape})
+    if isinstance(axis, int):
+        axis = [axis]
 
-    return out
+    return layers.unsqueeze(x, axis, name)
 
 
 def gather(input, index, overwrite=True):
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 878fdbfc1f5761317fb5f8a32bbee5f5ef7f5bc0..9b1d7ec3a542c471d2f960b6f0b0b2b7f3509b99 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -915,7 +915,7 @@ def mm(input, mat2, name=None):
     return out
 
 
-def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
+def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
     """
     :alias_main: paddle.addmm
     :alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
@@ -935,8 +935,8 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
         input (Variable): The input Tensor/LoDTensor to be added to the final result.
         x (Variable): The first input Tensor/LoDTensor for matrix multiplication.
         y (Variable): The second input Tensor/LoDTensor for matrix multiplication.
-        alpha (float): Coefficient of $x*y$.
         beta (float): Coefficient of $input$.
+        alpha (float): Coefficient of $x*y$.
         name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
 
     Returns:
@@ -947,25 +947,43 @@
 
             import numpy as np
             import paddle
-            import paddle.fluid as fluid
-
-            input = fluid.data(name='input', shape=[2, 2], dtype='float32')
-            x = fluid.data(name='x', shape=[2, 2], dtype='float32')
-            y = fluid.data(name='y', shape=[2, 2], dtype='float32')
-            out = paddle.addmm( input=input, x=x, y=y, alpha=5.0, beta=0.5 )
 
             data_x = np.ones((2, 2)).astype(np.float32)
             data_y = np.ones((2, 2)).astype(np.float32)
             data_input = np.ones((2, 2)).astype(np.float32)
 
-            place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            results = exe.run(fluid.default_main_program(),
-                              fetch_list=[out], feed={"input": data_input, 'x': data_x, "y": data_y})
-            print( np.array(results[0]) )
+            paddle.enable_imperative()
+
+            x = paddle.imperative.to_variable(data_x)
+            y = paddle.imperative.to_variable(data_y)
+            input = paddle.imperative.to_variable(data_input)
+
+            out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
+
+            print( out.numpy() )
             # [[10.5 10.5]
             # [10.5 10.5]]
     """
+    input_shape = input.shape
+    x_shape = x.shape
+    y_shape = y.shape
+    if not len(input_shape) == len(x_shape) == len(y_shape) == 2:
+        raise ValueError(
+            "The dimension of input, x, y should be 2 but received input's shape: {}, x's shape: {}, y's shape: {}".
+            format(input_shape, x_shape, y_shape))
+    if input_shape[0] != x_shape[0] and input_shape[0] != 1:
+        raise ValueError(
+            "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".
+            format(input_shape[0]))
+    if input_shape[1] != y_shape[1] and input_shape[1] != 1:
+        raise ValueError(
+            "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".
+            format(input_shape[1]))
+    if x_shape[1] != y_shape[0]:
+        raise ValueError(
+            "The input Variable x's width must be equal with Variable y's height. But received x's shape = {}, y's shape = {}.".
+            format(x_shape, y_shape))
+
     if in_dygraph_mode():
         out = core.ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
         return out
@@ -974,7 +992,7 @@
     attrs = {'Alpha': alpha, 'Beta': beta}
 
     helper = LayerHelper("addmm", **locals())
-    check_variable_and_dtype(x, 'Input', ['float32', 'float64'], 'addmm')
+    check_variable_and_dtype(input, 'Input', ['float32', 'float64'], 'addmm')
     check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
     check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)
diff --git a/python/requirements.txt b/python/requirements.txt
index 5e081f5e85b6e0f645991ab70874d04ab93e3106..13a1c9a9d638daf6a78f52d9d66fcf3f15b74c37 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -21,3 +21,4 @@ prettytable
 objgraph
 astor
 pathlib
+netifaces
diff --git a/python/setup.py.in b/python/setup.py.in
index df200da2cfc5b927402b2ed183eff5038aec8764..72819a7b9eed35e4be950addfdfe821a753eacbf 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -152,6 +152,7 @@ packages=['paddle',
           'paddle.fleet.dataset',
           'paddle.fleet.metrics',
           'paddle.fleet.proto',
+          'paddle.fleet.utils',
           'paddle.framework',
           'paddle.fluid',
           'paddle.fluid.dygraph',
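
Note: the new 'paddle.fleet.utils' entry in setup.py.in is what makes the rewritten imports in the tests resolvable from an installed wheel, and netifaces joins requirements.txt because the role-maker and fleet_util tests import it (skipping when it is absent). Code that previously pulled the filesystem and KV helpers from the three incubate modules can use the single namespace, with exactly the names imported in the tests above:

    from paddle.fleet.utils import (FS, LocalFS, HDFSClient, FSTimeOut,
                                    FSFileExistsError, FSFileNotExistsError,
                                    KVHandler, KVServer, KVHTTPServer)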