Commit 7be555ee authored by Z zlsh80826

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into trt_stack_op, test=develop

......@@ -45,14 +45,35 @@ inline void InitVarsInScope(const std::vector<VarInfo> &var_infos, Scope *scope,
// get CommContext and remote send and recv op
void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
#ifdef PADDLE_WITH_DISTRIBUTE
// init communicator here
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
bool need_communicator = false;
for (auto &node : graphs[0]->Nodes()) {
VLOG(3) << "node name " << node->Name();
if (node && node->IsOp()) {
if (node->Name() == "send") {
auto send_varnames =
BOOST_GET_CONST(std::vector<std::string>,
node->Op()->GetNullableAttr("send_varnames"));
if (send_varnames.size() > 0) {
need_communicator = true;
break;
}
}
}
}
if (need_communicator) {
// init communicator here
auto *instance = operators::distributed::Communicator::GetInstance();
auto initialized = instance ? true : false;
PADDLE_ENFORCE_EQ(initialized, true,
platform::errors::InvalidArgument(
"Communicator is not Initialized, you may use "
"FleetAPI(https://github.com/PaddlePaddle/Fleet/tree/"
"develop/markdown_doc/transpiler)"));
}
#endif
}
......
......@@ -19,6 +19,6 @@ else()
cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
endif(WITH_GLOO)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
......@@ -54,10 +54,9 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
paddle::framework::fs_remove(tmp);
if (i == retry_times_) {
VLOG(0) << "fs_open_write failed, retry times reaches limit";
// PADDLE_THROW(platform::errors::PreconditionNotMet(
// "fs_open_write failed, retry times reaches"
// " limit ",
// retry_times_));
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"fs_open_write failed, retry times reaches %d limit.",
retry_times_));
}
} else {
break;
......@@ -143,9 +142,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
break;
}
}
// PADDLE_THROW(platform::errors::ExecutionTimeout(
VLOG(0) << "TIMEOUT self_rank = " << self_rank_
<< " pair_rank = " << last_check_rank;
PADDLE_THROW(paddle::platform::errors::ExecutionTimeout(
"TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
last_check_rank));
}
std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
}
......
......@@ -28,38 +28,6 @@ DEFINE_bool(enable_unused_var_check, false,
"Checking whether operator contains unused inputs, "
"especially for grad operator. It should be in unittest.");
// NOTE(zhiqiu): Currently, there are some operators which involve unused
// inputs and cannot be removed from the allow_list below.
// They can be mainly divided into three categories:
// 0: the inputs of which are only used in if branch, or used in cuda kernel but
// not in cpu kernel;
// 1: the inputs of which are used to indicate dtype of outputs;
// 2: the inputs of which are used in fused operators.
// The category number is presented in the comments after each operator.
const std::unordered_set<std::string> op_with_unsed_vars_allow_list = {
"batch_norm", // 0
"batch_norm_grad", // 0
"sync_batch_norm", // 0
"sync_batch_norm_grad", // 0
"inplace_abn", // 0
"inplace_abn_grad", // 0
"dgc_momentum", // 0
"fake_quantize_range_abs_max", // 0
"rmsprop", // 0
"sequence_conv_grad", // 0
"roi_perspective_transform_grad", // 0
"fill_zeros_like", // 1
"fill_any_like", // 1
"nce_grad", // 1
"precision_recall", // 1
"fusion_seqpool_cvm_concat", // 2
"fused_batch_norm_act", // 2
"fused_batch_norm_act_grad", // 2
"data_norm", // 0
"data_norm_grad", // 0
};
namespace paddle {
namespace framework {
......@@ -75,9 +43,44 @@ void LogVarUsageIfUnusedVarCheckEnabled(const std::string &name) {
}
}
static const std::unordered_set<std::string> &GetOpWithUnusedVarAllowSet() {
// NOTE(zhiqiu): Currently, there are some operators which involve unused
// inputs and cannot be removed from the allow_list below.
// They can be mainly divided into three categories:
// 0: the inputs of which are only used in the if branch, or used in the
//    cuda kernel but not in the cpu kernel;
// 1: the inputs of which are used to indicate the dtype of outputs;
// 2: the inputs of which are used in fused operators.
// The category number is presented in the comments after each operator.
// Use pointer here for safe static deinitialization
static auto *allow_set = new std::unordered_set<std::string>({
// called once
"batch_norm", // 0
"batch_norm_grad", // 0
"sync_batch_norm", // 0
"sync_batch_norm_grad", // 0
"inplace_abn", // 0
"inplace_abn_grad", // 0
"dgc_momentum", // 0
"fake_quantize_range_abs_max", // 0
"rmsprop", // 0
"sequence_conv_grad", // 0
"roi_perspective_transform_grad", // 0
"fill_zeros_like", // 1
"fill_any_like", // 1
"nce_grad", // 1
"precision_recall", // 1
"fusion_seqpool_cvm_concat", // 2
"fused_batch_norm_act", // 2
"fused_batch_norm_act_grad", // 2
"data_norm", // 0
"data_norm_grad", // 0);
});
return *allow_set;
}
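A minimal sketch of the "never destroyed" pattern used above (GetExampleAllowSet and example_op are hypothetical names): the set is heap-allocated through a function-local static pointer, initialized once on first use and intentionally never deleted, so no static destructor can run and invalidate it while other static objects are still shutting down.

#include <string>
#include <unordered_set>

static const std::unordered_set<std::string>& GetExampleAllowSet() {
  // Initialized exactly once (thread-safe since C++11); never deleted,
  // so the set stays valid for the whole process lifetime.
  static auto* allow_set =
      new std::unordered_set<std::string>({"example_op"});
  return *allow_set;
}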
void CheckUnusedVar(const OperatorBase &op, const Scope &scope) {
// skip op in allow list.
if (op_with_unsed_vars_allow_list.count(op.Type()) != 0) {
if (GetOpWithUnusedVarAllowSet().count(op.Type()) != 0) {
return;
}
auto *used_set = GetThreadLocalUsedVarNameSet();
......
......@@ -56,9 +56,11 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
return static_cast<nvinfer1::IRuntime*>(
dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
}
static nvinfer1::IPluginRegistry* getPluginRegistry() {
#if IS_TRT_VERSION_GE(6000)
static nvinfer1::IPluginRegistry* GetPluginRegistry() {
return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
}
#endif
// A logger for creating the TensorRT infer builder.
class NaiveLogger : public nvinfer1::ILogger {
......
......@@ -178,12 +178,16 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
std::string name_space_;
std::string plugin_base_;
};
#endif
template <typename T>
class TrtPluginRegistrarV2 {
public:
TrtPluginRegistrarV2() { getPluginRegistry()->registerCreator(creator, ""); }
TrtPluginRegistrarV2() {
static auto func_ptr = GetPluginRegistry();
if (func_ptr != nullptr) {
func_ptr->registerCreator(creator, "");
}
}
private:
T creator;
......@@ -193,6 +197,8 @@ class TrtPluginRegistrarV2 {
static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
plugin_registrar_##name {}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
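The registrar above works through static object construction: a file-scope TrtPluginRegistrarV2 instance runs its constructor before main() and, with this change, registers the creator only if the dynamically loaded plugin registry is actually available. A minimal sketch of the same static-registration idea, with hypothetical ExampleCreator/ExampleRegistrar names:

#include <iostream>

struct ExampleCreator {
  const char* Name() const { return "example_plugin"; }
};

template <typename T>
struct ExampleRegistrar {
  ExampleRegistrar() {
    // Runs during static initialization, before main() is entered.
    std::cout << "registering " << T().Name() << std::endl;
  }
};

// A file-scope instance triggers registration as a side effect.
static ExampleRegistrar<ExampleCreator> example_registrar;

int main() { return 0; }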
......
......@@ -304,6 +304,7 @@ REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
REGISTER_OP_CPU_KERNEL(
squeeze, ops::SqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::SqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -311,12 +312,14 @@ REGISTER_OP_CPU_KERNEL(
squeeze_grad,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -324,6 +327,7 @@ REGISTER_OP_CPU_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -21,6 +21,7 @@ REGISTER_OP_CUDA_KERNEL(
squeeze, ops::SqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -29,6 +30,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -36,6 +38,7 @@ REGISTER_OP_CUDA_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -44,6 +47,7 @@ REGISTER_OP_CUDA_KERNEL(
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -36,26 +36,29 @@ extern void* tensorrt_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \
using tensorrt_func = decltype(&::__name); \
std::call_once(tensorrt_dso_flag, []() { \
tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
PADDLE_ENFORCE_NOT_NULL(tensorrt_dso_handle, \
platform::errors::Unavailable( \
"Load tensorrt %s failed", #__name)); \
}); \
static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \
PADDLE_ENFORCE_NOT_NULL( \
p_##__name, \
platform::errors::Unavailable("Load tensorrt %s failed", #__name)); \
if (p_##__name == nullptr) { \
return nullptr; \
} \
using tensorrt_func = decltype(&::__name); \
return reinterpret_cast<tensorrt_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if (NV_TENSORRT_MAJOR >= 6)
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL); \
__macro(getPluginRegistry);
#else
#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
__macro(createInferBuilder_INTERNAL); \
__macro(createInferRuntime_INTERNAL);
#endif
TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
......
......@@ -33,6 +33,7 @@ limitations under the License. */
#include <curand.h>
#include <thrust/system/cuda/error.h>
#include <thrust/system_error.h>
#include "paddle/fluid/platform/cuda_error.pb.h"
#endif // PADDLE_WITH_CUDA
......@@ -69,6 +70,8 @@ limitations under the License. */
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/imperative/type_defs.h"
DECLARE_int32(call_stack_level);
namespace paddle {
namespace platform {
......@@ -226,9 +229,7 @@ inline std::string SimplifyDemangleStr(std::string str) {
return str;
}
template <typename StrType>
inline std::string GetTraceBackString(StrType&& what, const char* file,
int line) {
inline std::string GetCurrentTraceBackString() {
static constexpr int TRACE_STACK_LIMIT = 100;
std::ostringstream sout;
......@@ -256,6 +257,13 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
#else
sout << "Windows not support stack backtrace yet.\n";
#endif
return sout.str();
}
template <typename StrType>
inline std::string GetErrorSumaryString(StrType&& what, const char* file,
int line) {
std::ostringstream sout;
sout << "\n----------------------\nError Message "
"Summary:\n----------------------\n";
sout << string::Sprintf("%s at (%s:%d)", std::forward<StrType>(what), file,
......@@ -264,6 +272,17 @@ inline std::string GetTraceBackString(StrType&& what, const char* file,
return sout.str();
}
template <typename StrType>
inline std::string GetTraceBackString(StrType&& what, const char* file,
int line) {
if (FLAGS_call_stack_level > 1) {
// FLAGS_call_stack_level>1 means showing c++ call stack
return GetCurrentTraceBackString() + GetErrorSumaryString(what, file, line);
} else {
return GetErrorSumaryString(what, file, line);
}
}
inline bool is_error(bool stat) { return !stat; }
inline void throw_on_error(bool stat, const std::string& msg) {
......@@ -427,7 +446,7 @@ struct EnforceNotMet : public std::exception {
*
* Examples:
* GET_DATA_SAFELY(ctx.Input<LoDTensor>("X"), "Input", "X", "Mul");
*/
*/
#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \
(([&]() -> std::add_lvalue_reference<decltype(*(__PTR))>::type { \
auto* __ptr = (__PTR); \
......@@ -463,7 +482,7 @@ struct EnforceNotMet : public std::exception {
*
* Examples:
* OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Mul");
*/
*/
#define OP_INOUT_CHECK(__EXPR, __ROLE, __NAME, __OP_TYPE) \
do { \
PADDLE_ENFORCE_EQ(__EXPR, true, paddle::platform::errors::NotFound( \
......@@ -491,7 +510,7 @@ struct EnforceNotMet : public std::exception {
* Note: GCC 4.8 cannot select the right overloaded function here, so we
* need to define different functions and macros; after we upgrade the
* CI gcc version, we can define just one BOOST_GET macro.
*/
*/
namespace details {
#define DEFINE_SAFE_BOOST_GET(__InputType, __OutputType, __OutputTypePtr, \
......
......@@ -483,3 +483,28 @@ DEFINE_double(local_exe_sub_scope_limit, 256.0, // MBytes
* Note:
*/
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
/**
* Debug related FLAG
* Name: FLAGS_call_stack_level
* Since Version: 2.0.0
* Value Range: int, default=2
* Example:
* Note: Used for debugging. Determines the call stack to print when an
* error or exception happens.
* If FLAGS_call_stack_level == 0, only the error message summary will be shown.
* If FLAGS_call_stack_level == 1, the python stack and error message summary
* will be shown.
* If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error
* message summary will be shown.
*/
DEFINE_int32(
call_stack_level, 2,
"Determine the call stack to print when error or exeception happens."
// TODO(zhiqiu): implement logic of FLAGS_call_stack_level==0
// "If FLAGS_call_stack_level == 0, only the error message summary will be "
// "shown. "
"If FLAGS_call_stack_level == 1, the python stack and error message "
"summary will be shown."
"If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
"error message summary will be shown.");
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include <cctype>
#include <functional>
#include <string>
......@@ -20,6 +21,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -35,6 +37,7 @@ DECLARE_bool(cpu_deterministic);
DECLARE_bool(enable_rpc_profiler);
DECLARE_int32(multiple_of_cupti_buffer_size);
DECLARE_bool(reader_queue_speed_test_mode);
DECLARE_int32(call_stack_level);
// device management
DECLARE_int32(paddle_num_threads);
// executor
......@@ -337,14 +340,15 @@ static void RegisterGlobalVarGetterSetter() {
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
FLAGS_cpu_deterministic, FLAGS_enable_rpc_profiler,
FLAGS_multiple_of_cupti_buffer_size, FLAGS_reader_queue_speed_test_mode,
FLAGS_pe_profile_fname, FLAGS_print_sub_graph_dir,
FLAGS_fraction_of_cpu_memory_to_use, FLAGS_fuse_parameter_groups_size,
FLAGS_fuse_parameter_memory_size, FLAGS_init_allocated_mem,
FLAGS_initial_cpu_memory_in_mb, FLAGS_memory_fraction_of_eager_deletion,
FLAGS_use_pinned_memory, FLAGS_benchmark, FLAGS_inner_op_parallelism,
FLAGS_tracer_profile_fname, FLAGS_paddle_num_threads);
FLAGS_call_stack_level, FLAGS_cpu_deterministic,
FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,
FLAGS_fuse_parameter_groups_size, FLAGS_fuse_parameter_memory_size,
FLAGS_init_allocated_mem, FLAGS_initial_cpu_memory_in_mb,
FLAGS_memory_fraction_of_eager_deletion, FLAGS_use_pinned_memory,
FLAGS_benchmark, FLAGS_inner_op_parallelism, FLAGS_tracer_profile_fname,
FLAGS_paddle_num_threads);
#ifdef PADDLE_WITH_CUDA
REGISTER_PUBLIC_GLOBAL_VAR(
......
......@@ -12,5 +12,523 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defination of Role Makers."""
import os
import time
import numpy as np
from multiprocessing import Process, Manager
import paddle.fluid as fluid
# __all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
__all__ = ['RoleMakerBase', 'UserDefinedRoleMaker', 'PaddleCloudRoleMaker']
class Role:
WORKER = 1
SERVER = 2
class RoleMakerBase(object):
"""
RoleMakerBase is a base class for assigning a role to current process
in distributed training.
A paddle developer can implement RoleMakerBase to design a role maker
for worker or pserver assignment.
"""
def __init__(self):
self._worker_endpoints = []
self._server_endpoints = []
self._role_is_generated = False
self._role = None
self._current_id = -1
self._node_type = None
self._node_type_comm = None
self._all_comm = None
def is_worker(self):
"""
return whether the current process is a worker
"""
raise NotImplementedError("Please implement this method in child class")
def is_server(self):
"""
return whether the current process is a server
"""
raise NotImplementedError("Please implement this method in child class")
def is_first_worker(self):
"""
Check whether the node is the first instance of worker.
Returns:
bool: True if this is the first node of worker,
False if not.
"""
raise NotImplementedError("Please implement this method in child class")
def worker_num(self):
"""
Get current total worker number.
Returns:
int: worker number
"""
raise NotImplementedError("Please implement this method in child class")
def server_num(self):
"""
Get current total server number.
Returns:
int: server number
"""
raise NotImplementedError("Please implement this method in child class")
def worker_index(self):
"""
Get current worker id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def server_index(self):
"""
Get current server id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def role_id(self):
"""
Get current id.
Returns:
int: node id
"""
raise NotImplementedError("Please implement this method in child class")
def get_trainer_endpoints(self):
"""
return trainer endpoints
"""
return self._worker_endpoints
def get_pserver_endpoints(self):
"""
return pserver endpoints
"""
return self._server_endpoints
def to_string(self):
return "role: {}, current_id: {}, worker_endpoints: {}, server_endpoints: {}".format(
self._role, self._current_id, self._worker_endpoints,
self._server_endpoints)
def _all_gather(self, comm_world, input):
"""
Args:
input(int|float): input value
Returns:
return a list of values
"""
print("warning: RoleMakerBase does not have all gather.")
return None
def _all_reduce(self, comm_world, input, mode="sum"):
"""
Args:
input(list|numpy.array): one-dimensional array
mode(str): "sum", "min" or "max"
Returns:
the reduced result (None in the base class)
"""
print("warning: RoleMakerBase does not have all reduce.")
return None
def _barrier(self, comm_world):
"""
barrier between trainers if current role is TRAINER
"""
print("warning: RoleMakerBase does not have barrier worker.")
class PaddleCloudRoleMaker(RoleMakerBase):
def __init__(self, is_collective=False, init_gloo=True, **kwargs):
super(PaddleCloudRoleMaker, self).__init__()
self._is_collective = is_collective
self._init_gloo = init_gloo
self._kwargs = kwargs
self._role_is_generated = False
self._server_endpoints = None
self._worker_endpoints = None
self._node_type_comm = None
self._all_comm = None
if not self._is_collective:
self._hdfs_name = kwargs.get("hdfs_name", "")
self._hdfs_ugi = kwargs.get("hdfs_ugi", "")
self._hdfs_path = kwargs.get("path", "").rstrip("/")
self._init_timeout_seconds = kwargs.get("init_timeout_seconds",
3600)
self._run_timeout_seconds = kwargs.get("run_timeout_seconds",
9999999)
ip_port = kwargs.get("http_ip_port", "")
self._http_ip_port = []
self._http_server = None
# if ip_port is not empty, it will use http instead of hdfs
if ip_port != "":
self._http_ip_port = ip_port.split(":")
# it's for communication between processes
self._manager = Manager()
# global dict to store status
self._http_server_d = self._manager.dict()
# set running status of http server
self._http_server_d["running"] = False
self._iface = self.__get_default_iface()
# this environment variable can be empty
self._prefix = os.getenv("SYS_JOB_ID", "")
def _barrier(self, comm_world):
if comm_world:
comm_world.barrier()
def _all_gather(self, comm_world, input):
if comm_world:
self._barrier(comm_world)
output = comm_world.all_gather(input)
return output
else:
return None
def _all_reduce(self, comm_world, input, mode="sum"):
if not comm_world:
return None
input = np.array(input)
input_shape = input.shape
input_list = input.reshape(-1).tolist()
self._barrier(comm_world)
ans = comm_world.all_reduce(input_list, mode)
output = np.array(ans).reshape(input_shape)
return output
def is_worker(self):
"""
whether current process is worker
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.WORKER
def is_server(self):
"""
whether current process is server
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.SERVER
def is_first_worker(self):
"""
whether current process is worker of rank 0
"""
if not self._role_is_generated:
self.generate_role()
return self._role == Role.WORKER and self._current_id == 0
def worker_index(self):
"""
get index of current worker
"""
if not self._role_is_generated:
self.generate_role()
return self._current_id
def server_index(self):
"""
get index of current server
"""
if not self._role_is_generated:
self.generate_role()
return self._current_id
def role_id(self):
"""
get index of current node
"""
if self.is_server():
return self.server_index()
elif self.is_worker():
return self.worker_index()
def worker_num(self):
"""
return the current number of workers
"""
if not self._role_is_generated:
self.generate_role()
return self._trainers_num
def server_num(self):
    """
    return the current number of servers
    """
    if not self._role_is_generated:
        self.generate_role()
    return len(self._server_endpoints)
def get_trainer_endpoints(self):
"""
get endpoint of all trainers
"""
if not self._role_is_generated:
self.generate_role()
return self._worker_endpoints
def get_pserver_endpoints(self):
"""
get endpoint of all pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._server_endpoints
def _get_rank(self):
"""
get current rank in all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._rank
def _get_size(self):
"""
get total num of all workers and pservers
"""
if not self._role_is_generated:
self.generate_role()
return self._size
def _ps_env(self):
try:
# Environment variable PADDLE_PSERVERS_IP_PORT_LIST must be set
# format: string(ip:port), eg. 127.0.0.1:6001
self._server_endpoints = os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
"").split(",")
trainers_num = int(os.environ["PADDLE_TRAINERS_NUM"])
training_role = os.environ["TRAINING_ROLE"]
if training_role not in ["TRAINER", "PSERVER"]:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
if training_role == "TRAINER":
role = Role.WORKER
current_id = int(os.environ["PADDLE_TRAINER_ID"])
if len(self._worker_endpoints) > 0:
self._cur_endpoint = self._worker_endpoints[current_id]
elif training_role == "PSERVER":
role = Role.SERVER
port = os.environ["PADDLE_PORT"]
ip = os.environ["POD_IP"]
self._cur_endpoint = ip + ":" + port
current_id = self._server_endpoints.index(self._cur_endpoint)
else:
raise ValueError("TRAINING_ROLE must be PSERVER or TRAINER")
except ValueError as e:
    raise ValueError(
        "something wrong with PaddleCloud environment variables, "
        "please check: {}".format(e))
self._trainers_num = trainers_num
self._role = role
self._current_id = current_id
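A hedged sketch of the environment a parameter-server job is expected to provide before generate_role() runs; all addresses below are placeholders:

import os

# Two pservers and two trainers; this process acts as trainer 0.
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:6001,127.0.0.1:6002"
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:6101,127.0.0.1:6102"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["TRAINING_ROLE"] = "TRAINER"  # or "PSERVER", with PADDLE_PORT/POD_IP
os.environ["PADDLE_TRAINER_ID"] = "0"

role_maker = PaddleCloudRoleMaker(is_collective=False, init_gloo=False)
role_maker.generate_role()
assert role_maker.is_worker() and role_maker.worker_index() == 0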
def _collective_env(self):
self._current_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
self._training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
assert (self._training_role == "TRAINER")
self._worker_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS")
self._cur_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
assert self._worker_endpoints is not None, "can't find PADDLE_TRAINER_ENDPOINTS"
self._worker_endpoints = self._worker_endpoints.split(",")
self._trainers_num = len(self._worker_endpoints)
def _init_gloo_env(self):
def init_gloo_instance(role="trainer"):
role = role.lower()
assert role in ["trainer", "pserver", "all"]
if role == "trainer":
all_list = self._worker_endpoints
rank = self._current_id
elif role == "pserver":
all_list = self._server_endpoints
rank = self._current_id
else:
all_list = self._worker_endpoints + self._server_endpoints
rank = all_list.index(self._cur_endpoint)
gloo = fluid.core.Gloo()
gloo.set_rank(rank)
gloo.set_size(len(all_list))
gloo.set_prefix(self._prefix)
gloo.set_iface(self._iface)
gloo.set_timeout_seconds(self._init_timeout_seconds,
self._run_timeout_seconds)
if len(self._http_ip_port) != 0:
gloo.set_http_store(self._http_ip_port[0],
int(self._http_ip_port[1]), role)
else:
gloo.set_hdfs_store(self._hdfs_path + "/" + role,
self._hdfs_name, self._hdfs_ugi)
gloo.init()
return gloo
# paddlecloud supports gloo
if self._role == Role.WORKER:
if self._current_id == 0 and len(self._http_ip_port) != 0:
size_d = {
"trainer": len(self._worker_endpoints),
"pserver": len(self._server_endpoints),
"all":
len(self._worker_endpoints) + len(self._server_endpoints)
}
# child process for http server
self._http_server = Process(
target=self.__start_kv_server,
args=(self._http_server_d, size_d))
self._http_server.daemon = True
# set running status to True
self._http_server_d["running"] = True
# start child process
self._http_server.start()
self._node_type = 1
gloo = init_gloo_instance("trainer")
self._node_type_comm = gloo
else:
assert self._role == Role.SERVER
self._node_type = 0
gloo = init_gloo_instance("pserver")
self._node_type_comm = gloo
all_list = self._worker_endpoints + self._server_endpoints
self._rank = all_list.index(self._cur_endpoint)
self._size = len(all_list)
gloo = init_gloo_instance("all")
self._all_comm = gloo
if self._http_server is not None:
# set running status to False
self._http_server_d["running"] = False
# wait until child process exits
self._http_server.join()
def generate_role(self):
"""
generate role for role maker
"""
if not self._role_is_generated:
if not self._is_collective:
self._ps_env()
if self._init_gloo:
self._init_gloo_env()
else:
self._collective_env()
self._role_is_generated = True
def __get_default_iface(self):
"""
get default physical interface
"""
default1 = self.__get_default_iface_from_gateway()
default2 = self.__get_default_iface_from_interfaces()
return default2 if default1 == "lo" else default1
def __get_default_iface_from_gateway(self):
"""
get default physical interface
"""
import netifaces
gateways = netifaces.gateways()
if gateways.get(netifaces.AF_INET) is not None:
gateway = gateways[netifaces.AF_INET]
if len(gateway) > 0 and len(gateway[0]) > 1:
return gateway[0][1]
return "lo"
def __get_default_iface_from_interfaces(self):
"""
get default physical interface
"""
import netifaces
for intf_name in netifaces.interfaces():
addresses = netifaces.ifaddresses(intf_name)
if netifaces.AF_INET in addresses:
ipv4_addresses = addresses[netifaces.AF_INET]
for ipv4_address in ipv4_addresses:
if 'broadcast' in ipv4_address:
return intf_name
return "lo"
def __start_kv_server(self, http_server_d, size_d):
from paddle.fleet.utils import KVServer
http_server = KVServer(int(self._http_ip_port[1]), size_d)
http_server.start()
wait_seconds = 5
while http_server_d.get("running",
                        False) and not http_server.should_stop():
time.sleep(wait_seconds)
http_server.stop()
class UserDefinedRoleMaker(PaddleCloudRoleMaker):
def __init__(self, is_collective=False, init_gloo=False, **kwargs):
super(UserDefinedRoleMaker, self).__init__(
is_collective=is_collective, init_gloo=init_gloo, **kwargs)
def _user_defined_ps_env(self):
self._server_endpoints = self._kwargs.get("server_endpoints")
self._worker_endpoints = self._kwargs.get("worker_endpoints", [])
self._trainers_num = self._kwargs.get("worker_num", 0)
if self._trainers_num == 0:
assert (len(self._worker_endpoints) > 0)
self._trainers_num = len(self._worker_endpoints)
self._role = self._kwargs.get("role")
self._current_id = self._kwargs.get("current_id")
if self._role == Role.WORKER and len(
self._worker_endpoints) > self._current_id:
self._cur_endpoint = self._worker_endpoints[self._current_id]
elif self._role == Role.SERVER:
self._cur_endpoint = self._server_endpoints[self._current_id]
def _user_defined_collective_env(self):
self._worker_endpoints = self._kwargs.get("worker_endpoints")
self._current_id = self._kwargs.get("current_id")
self._trainers_num = len(self._worker_endpoints)
self._training_role = Role.WORKER
def generate_role(self):
"""
generate role for role maker
"""
if not self._role_is_generated:
if not self._is_collective:
self._user_defined_ps_env()
if self._init_gloo:
self._init_gloo_env()
else:
self._user_defined_collective_env()
self._role_is_generated = True
......@@ -18,12 +18,27 @@
__all__ = ['UtilBase']
import numpy as np
import os
import subprocess
from paddle.fluid import core
from collections import OrderedDict
import paddle.fluid as fluid
from google.protobuf import text_format
from paddle.fluid import debugger
from paddle.fluid.framework import Program
from paddle.fluid.proto import framework_pb2
from ..utils.fs import FS, LocalFS, HDFSClient
class UtilFactory(object):
def _create_util(self, context):
def _create_util(self, context=None):
util = UtilBase()
util._set_strategy(context["valid_strategy"])
util._set_role_maker(context["role_maker"])
if context is not None and "valid_strategy" in context:
util._set_strategy(context["valid_strategy"])
if context is not None and "role_maker" in context:
util._set_role_maker(context["role_maker"])
return util
......@@ -38,43 +53,390 @@ class UtilBase(object):
def _set_role_maker(self, role_maker):
self.role_maker = role_maker
'''
def set_file_system(self, fs_client):
assert isinstance(
fs_client,
FS), "fs_client must be the instance of paddle.fleet.utils.FS"
self.fs_client = fs_client
def broadcast(self):
pass
def __check_comm_world(self, comm_world="worker"):
if not self.role_maker._role_is_generated:
self.role_maker.generate_role()
def all_gather(self):
pass
_comm_world = None
comm_world_upper = comm_world.upper()
if comm_world_upper == "WORKER":
if not self.role_maker.is_worker():
print(
"warning: current role is not worker in collective_func(comm_world=\"worker\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "SERVER":
if not self.role_maker.is_server():
print(
"warning: current role is not server in collective_func(comm_world=\"server\")"
)
_comm_world = self.role_maker._node_type_comm
elif comm_world_upper == "ALL":
_comm_world = self.role_maker._all_comm
else:
raise ValueError(
"not support comm_world, please choose one from [worker, server, all]"
)
def all_reduce(self):
pass
return _comm_world
def reduce_scatter(self):
def all_reduce(self, input, mode, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_reduce(_comm_world, input, mode)
def barrier(self, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
self.role_maker._barrier(_comm_world)
def all_gather(self, input, comm_world="worker"):
_comm_world = self.__check_comm_world(comm_world)
return self.role_maker._all_gather(_comm_world, input)
def broadcast(self):
pass
def reduce(self):
def scatter(self):
pass
def get_file_shard(self, files):
pass
"""
split files before distributed training,
example 1: files is [a, b, c ,d, e] and trainer_num = 2, then trainer
0 gets [a, b, c] and trainer 1 gets [d, e].
example 2: files is [a, b], and trainer_num = 3, then trainer 0 gets
[a], trainer 1 gets [b], trainer 2 gets []
def feed_gen(self, batch_size, feed_vars_dims, feeded_vars_filelist):
pass
Args:
files(list): file list need to be read.
def save_program(program, output_dir):
pass
Returns:
list: files belongs to this worker.
"""
if not isinstance(files, list):
raise TypeError("files should be a list of file need to be read.")
def load_program(input_dir):
pass
trainer_id = self.role_maker.worker_index()
trainers = self.role_maker.worker_num()
def load_var():
pass
remainder = len(files) % trainers
blocksize = int(len(files) / trainers)
def save_var():
pass
blocks = [blocksize] * trainers
for i in range(remainder):
blocks[i] += 1
def print_on_rank(self):
pass
'''
trainer_files = [[]] * trainers
begin = 0
for i in range(trainers):
trainer_files[i] = files[begin:begin + blocks[i]]
begin += blocks[i]
return trainer_files[trainer_id]
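A usage sketch matching the docstring examples above; it assumes a role maker reporting two trainers has already been attached via _set_role_maker:

files = ["a", "b", "c", "d", "e"]
# On trainer 0 of 2 this returns ["a", "b", "c"]; on trainer 1, ["d", "e"].
shard = fleet_util.get_file_shard(files)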
def print_on_rank(self, message, rank_id):
if self.role_maker.worker_index() != rank_id:
return
print(message)
def _save_program(self, program, model_filename='__model__', is_text=False):
if is_text:
with open(model_filename, "w") as f:
f.write(str(program))
else:
with open(model_filename, "wb") as f:
f.write(program.desc.serialize_to_string())
def _load_program(self, path, is_text):
def load_program_binary(path):
"""load program from binary string file"""
with open(path, "rb") as f:
program_desc_str = f.read()
return Program.parse_from_string(program_desc_str)
def load_program_text(path):
"""load program from human-readable text file"""
with open(path, "r") as f:
program_desc_text = f.read()
prog_desc = framework_pb2.ProgramDesc()
text_format.Merge(program_desc_text, prog_desc)
return Program.parse_from_string(prog_desc.SerializeToString())
if is_text:
return load_program_text(path)
else:
return load_program_binary(path)
def _program_type_trans(self, prog_dir, prog_fn, is_text):
prog = self._load_program(os.path.join(prog_dir, prog_fn), is_text)
prog_out_fn = prog_fn + ".bin" if is_text else prog_fn + ".pbtxt"
self._save_program(prog,
                   os.path.join(prog_dir, prog_out_fn), not is_text)
return prog_out_fn
def _visualize_graphviz(self, program, output_dir, output_filename):
block = program.global_block()
dot_path = os.path.join(output_dir, output_filename + '.dot')
pdf_path = os.path.join(output_dir, output_filename + '.pdf')
debugger.draw_block_graphviz(block, path=dot_path)
cmd = ["dot", "-Tpdf", dot_path, "-o", pdf_path]
p = subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
p.wait()
def _proto_check(self, config):
train_prog = self._load_program(config.train_prog_path,
config.is_text_train_program)
pruned_prog = self._load_program(config.pruned_prog_path,
config.is_text_pruned_program)
is_match = True
pruned_vars = [(v.name, v) for v in pruned_prog.list_vars()
if fluid.io.is_persistable(v)]
pruned_vars = OrderedDict(pruned_vars)
pruned_vars_name = [name for name in pruned_vars]
print("persistable vars in pruned program: {}".format(pruned_vars_name))
# feed and fetch ops are added to the pruned program during pruning, so they need not exist in the train program
feed_fetch_type_list = [
core.VarDesc.VarType.FEED_MINIBATCH, core.VarDesc.VarType.FETCH_LIST
]
for var_name in pruned_vars:
var = pruned_vars[var_name]
# feed and fetch ops are added to the pruned program during pruning, so they need not exist in the train program
if var.type in feed_fetch_type_list:
break
try:
train_prog_var = train_prog.global_block().var(var_name)
except ValueError as e:
print(
    "Cannot find variable '%s' in the train program, please check pruning."
    % var_name)
is_match = False
continue
if var.shape != train_prog_var.shape or var.dtype != train_prog_var.dtype:
print(
    "variable {} does not match: pruned program shape {} dtype {}, train program shape {} dtype {}".
    format(var_name, var.shape, var.dtype, train_prog_var.shape,
           train_prog_var.dtype))
is_match = False
return is_match
def _params_check(self, config):
def feed_gen(batch_size, feeded_vars_dims, feeded_vars_filelist):
def reader(batch_size, fn, dim):
data = []
if isinstance(dim, list) or isinstance(dim, tuple):
shape = list(dim)
_temp = 1
for x in dim:
_temp = _temp * x
dim = _temp
else:
shape = [dim]
shape = [batch_size] + shape
dim = dim * batch_size
for line in open(fn, 'r'):
fields = line.strip().split(' ')
fields = [float(d) for d in fields]
while len(fields) >= dim:
tmp = fields[:dim]
fields = fields[dim:]
data.append(np.array(tmp).reshape(shape))
return data
batch_feed = []
for i, fn in enumerate(feeded_vars_filelist):
batch_feed.append(reader(batch_size, fn, feeded_vars_dims[i]))
return batch_feed
prog = self._load_program(
os.path.join(config.dump_model_dir, config.dump_program_filename),
config.is_text_dump_program)
model_filename = config.dump_program_filename
if config.is_text_dump_program:
    model_filename = self._program_type_trans(
        config.dump_model_dir, config.dump_program_filename,
        config.is_text_dump_program)
saved_params = [
v for v in prog.list_vars() if fluid.io.is_persistable(v)
]
print("persistable vars in dump program: {}".format(
[v.name for v in saved_params]))
def check_not_expected_ops(prog, not_expected_op_types):
op_types_set = set()
for op in prog.global_block().ops:
if op.type in not_expected_op_types and op.type not in op_types_set:
op_types_set.add(op.type)
return op_types_set
not_expected_op_types = check_not_expected_ops(prog, ["lookup_table"])
if len(not_expected_op_types) > 0:
print(
    "found op type '{}' in program, please check if your program is pruned correctly!".
    format(list(not_expected_op_types)))
return False
place = fluid.CPUPlace()
exe = fluid.Executor(place)
scope = fluid.core.Scope()
with fluid.scope_guard(scope):
inference_program, feed_target_names, fetch_targets = \
fluid.io.load_inference_model(config.dump_model_dir, exe, model_filename=model_filename,
params_filename=config.save_params_filename)
# check program vars and saved vars shape
orig_para_shape = {
each_var.name: tuple(each_var.desc.shape())
for each_var in saved_params
}
for each_var in saved_params:
var_temp = fluid.global_scope().find_var(each_var.name)
assert var_temp is not None, "can't find var: " + each_var.name
new_shape = (np.array(var_temp.get_tensor())).shape
assert each_var.name in orig_para_shape, each_var.name + " MUST be in the var list"
orig_shape = orig_para_shape.get(each_var.name)
if new_shape != orig_shape:
raise RuntimeError(
"Shape not matching: the Program requires a parameter with a shape of ({}), "
"while the loaded parameter (namely [ {} ]) has a shape of ({}).".
format(orig_shape, each_var.name, new_shape))
# check feed/fetch vars in program and config
feed_config = config.feed_config
fetch_config = config.fetch_config
fetch_targets_names = [v.name for v in fetch_targets]
if not feed_target_names:
print("warning! no feed targets in program.")
if not fetch_targets_names:
print("warning! no fetch targets in program.")
fetch_list = fetch_targets
feed_name_list = feed_target_names
if feed_config.feeded_vars_names is not None and feed_target_names != feed_config.feeded_vars_names:
print(
"warning! feed vars in program and config are diff: feed in program: {}. feed in config {}.".
format(feed_target_names, feed_config.feeded_vars_names))
feed_name_list = feed_config.feeded_vars_names
# remove feed op in inference_program. new feed op will be added in exe.run
global_block = inference_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "feed": # only remove feed op here
need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
if fetch_config.fetch_vars_names is not None and fetch_targets_names != fetch_config.fetch_vars_names:
print(
"warning! fetch vars in program and config are diff: fetch in program: {}. fetch in config {}.".
format(fetch_targets_names, fetch_config.fetch_vars_names))
fetch_list = [
inference_program.global_block().var(i)
for i in fetch_config.fetch_vars_names
]
# remove fetch op in inference_program. new fetch op will be added in exe.run
global_block = inference_program.global_block()
need_to_remove_op_index = []
for i, op in enumerate(global_block.ops):
op.desc.set_is_target(False)
if op.type == "fetch": # only remove fetch op here
need_to_remove_op_index.append(i)
for index in need_to_remove_op_index[::-1]:
global_block._remove_op(index)
# if fetch_list have lod tensor
return_numpy = all([v.lod_level == 0 for v in fetch_list])
# try dump fetch_targets
feed_tensors = []
assert len(feed_config.feeded_vars_names) == len(
feed_config.feeded_vars_dims) == len(
feed_config.feeded_vars_types)
# check program vars and feed tensor shape in config
for i in range(len(feed_config.feeded_vars_names)):
var = inference_program.global_block().var(
feed_config.feeded_vars_names[i])
if not isinstance(feed_config.feeded_vars_dims[i],
(list, tuple)):
tensor_shape = (feed_config.feeded_vars_dims[i], )
else:
tensor_shape = tuple(feed_config.feeded_vars_dims[i])
feed_config.feeded_vars_dims[i] = tensor_shape
var_shape = var.shape[1:]
if tensor_shape != var_shape:
raise RuntimeError(
"feed variable '{}' shape not match. infer program shape: {}. feed tensor shape: {}".
format(feed_config.feeded_vars_names[i], var_shape,
tensor_shape))
if not feed_config.feeded_vars_filelist:
print("generate random feed vars.")
for i in range(len(feed_config.feeded_vars_names)):
var = inference_program.global_block().var(
feed_config.feeded_vars_names[i])
# create fake feed tensor. if lod_level > 1, should create_lod_tensor()
if var.lod_level == 0:
feed_tensors.append(
np.array(
np.random.random(
tuple([config.batch_size] + list(
feed_config.feeded_vars_dims[i]))),
dtype=feed_config.feeded_vars_types[i]))
elif var.lod_level == 1:
t = np.array(
np.random.random(
tuple([config.batch_size] + list(
feed_config.feeded_vars_dims[i]))),
dtype=feed_config.feeded_vars_types[i])
feed_tensors.append(
fluid.create_lod_tensor(t, [[1] * config.batch_size
], place))
else:
raise RuntimeError(
    "vars with lod_level >= 2 are not supported yet in this infer program check tool."
)
results = exe.run(inference_program,
feed={
name: feed_tensors[i]
for i, name in enumerate(feed_name_list)
},
fetch_list=fetch_list,
return_numpy=return_numpy)
else:
print("load feed vars from files: {}.".format(
feed_config.feeded_vars_filelist))
feed_vars = [
inference_program.global_block().var(
feed_config.feeded_vars_names[i])
for i in range(len(feed_config.feeded_vars_names))
]
feeder = fluid.DataFeeder(feed_list=feed_vars, place=place)
batch_feed = feed_gen(config.batch_size,
feed_config.feeded_vars_dims,
feed_config.feeded_vars_filelist)
slots = [batch_feed]
results = exe.run(inference_program,
feed=feeder.feed(slots),
fetch_list=fetch_list,
return_numpy=return_numpy)
for i, v in enumerate(fetch_list):
print("fetch_targets name: %s" % v.name)
print("fetch_targets: {}".format(results[i]))
return results
fleet_util = UtilFactory()._create_util(None)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fs import *
from .http_server import KVHandler, KVHTTPServer, KVServer
__all__ = ['KVHandler', 'KVHTTPServer', 'KVServer'] + fs.__all__
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import subprocess
import multiprocessing
from datetime import datetime
import re
import copy
import errno
import time
import logging
import six
import abc
import paddle.fluid as fluid
import functools
from pathlib import PurePosixPath, Path
import shutil
__all__ = [
'FS', 'LocalFS', 'HDFSClient', 'ExecuteError', 'FSTimeOut',
'FSFileExistsError', 'FSFileNotExistsError'
]
class ExecuteError(Exception):
pass
class FSFileExistsError(Exception):
pass
class FSFileNotExistsError(Exception):
pass
class FSTimeOut(Exception):
pass
class FS(object):
@abc.abstractmethod
def ls_dir(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_file(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_dir(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def is_exist(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def upload(self, local_path, fs_path):
raise NotImplementedError
@abc.abstractmethod
def download(self, fs_path, local_path):
raise NotImplementedError
@abc.abstractmethod
def mkdirs(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def delete(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def need_upload_download(self):
raise NotImplementedError
@abc.abstractmethod
def rename(self, fs_src_path, fs_dst_path):
raise NotImplementedError
@abc.abstractmethod
def mv(self, fs_src_path, fs_dst_path):
raise NotImplementedError
@abc.abstractmethod
def upload_dir(self, local_dir, dest_dir):
raise NotImplementedError
@abc.abstractmethod
def glob(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def stat(self, fs_path):
raise NotImplementedError
@abc.abstractmethod
def walk(self, fs_path):
raise NotImplementedError
class LocalFS(FS):
def ls_dir(self, fs_path):
if not self.is_exist(fs_path):
return [], []
dirs = []
files = []
for f in os.listdir(fs_path):
if os.path.isdir(fs_path + "/" + f):
dirs.append(f)
else:
files.append(f)
return dirs, files
def mkdirs(self, fs_path):
assert not os.path.isfile(fs_path), "{} is already a file".format(
fs_path)
os.system("mkdir -p {}".format(fs_path))
def is_file(self, fs_path):
return os.path.isfile(fs_path)
def is_dir(self, fs_path):
return os.path.isdir(fs_path)
def is_exist(self, fs_path):
return os.path.exists(fs_path)
def _rmr(self, fs_path):
shutil.rmtree(fs_path)
def _rm(self, fs_path):
os.remove(fs_path)
def delete(self, fs_path):
if not self.is_exist(fs_path):
return
if os.path.isfile(fs_path):
return self._rm(fs_path)
return self._rmr(fs_path)
def rename(self, fs_src_path, fs_dst_path):
os.rename(fs_src_path, fs_dst_path)
def need_upload_download(self):
return False
def touch(self, fs_path):
return Path(fs_path).touch()
def mv(self, src_path, dst_path):
if not self.is_exist(src_path):
raise FSFileNotExistsError
if self.is_exist(dst_path):
raise FSFileExistsError
return self.rename(src_path, dst_path)
"""HDFS Utils."""
def _handle_errors(f):
def handler(*args, **kwargs):
start = time.time()
while True:
try:
return f(*args, **kwargs)
except ExecuteError as e:
o = args[0]
time_out = float(o._time_out) / 1000.0
inter = float(o._sleep_inter) / 1000.0
if time.time() - start >= time_out:
raise FSTimeOut
time.sleep(inter)
return functools.wraps(f)(handler)
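A minimal sketch of what the decorator above provides, using a hypothetical flaky operation; _time_out and _sleep_inter are read from the decorated object, in milliseconds:

class ExampleClient(object):
    def __init__(self):
        self._time_out = 2000     # give up after ~2 seconds
        self._sleep_inter = 100   # wait 100 ms between retries
        self._calls = 0

    @_handle_errors
    def flaky_op(self):
        self._calls += 1
        if self._calls < 3:
            raise ExecuteError()  # retried until it succeeds or times out
        return "ok"

# Succeeds on the third attempt: ExampleClient().flaky_op() == "ok"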
class HDFSClient(FS):
def __init__(
self,
hadoop_home,
configs,
time_out=5 * 60 * 1000, #ms
sleep_inter=1000): #ms
# Raise exception if JAVA_HOME not exists.
java_home = os.environ["JAVA_HOME"]
self.pre_commands = []
hadoop_bin = '%s/bin/hadoop' % hadoop_home
self.pre_commands.append(hadoop_bin)
dfs = 'fs'
self.pre_commands.append(dfs)
if configs:
for k, v in six.iteritems(configs):
self.pre_commands.append('-D%s=%s' % (k, v))
self._time_out = time_out
self._sleep_inter = sleep_inter
self._base_cmd = " ".join(self.pre_commands)
self._bd_err_re = re.compile(
r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:')
def _run_cmd(self, cmd, redirect_stderr=False):
ret, output = fluid.core.shell_execute_cmd(cmd, 0, 0, redirect_stderr)
return int(ret), output.splitlines()
@_handle_errors
def ls_dir(self, fs_path):
"""
list directory under fs_path, and only give the pure name, not include the fs_path
"""
if not self.is_exist(fs_path):
return [], []
cmd = "{} -ls {}".format(self._base_cmd, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
dirs = []
files = []
for line in lines:
arr = line.split()
if len(arr) != 8:
continue
if fs_path not in arr[7]:
continue
p = PurePosixPath(arr[7])
if arr[0][0] == 'd':
dirs.append(p.name)
else:
files.append(p.name)
return dirs, files
def _test_match(self, lines):
for l in lines:
m = self._bd_err_re.match(l)
if m is not None:
return m
return None
@_handle_errors
def is_dir(self, fs_path):
if not self.is_exist(fs_path):
return False
cmd = "{} -test -d {}".format(
self._base_cmd, fs_path, redirect_stderr=True)
ret, lines = self._run_cmd(cmd)
if ret:
# other error
if self._test_match(lines) is not None:
raise ExecuteError
return False
return True
def is_file(self, fs_path):
if not self.is_exist(fs_path):
return False
return not self.is_dir(fs_path)
@_handle_errors
def is_exist(self, fs_path):
cmd = "{} -ls {} ".format(self._base_cmd, fs_path)
ret, out = self._run_cmd(cmd, redirect_stderr=True)
if ret != 0:
for l in out:
if "No such file or directory" in l:
return False
raise ExecuteError
return True
@_handle_errors
def upload(self, local_path, fs_path):
if self.is_exist(fs_path):
raise FSFileExistsError
local = LocalFS()
if not local.is_exist(local_path):
raise FSFileNotExistsError
cmd = "{} -put {} {}".format(self._base_cmd, local_path, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def download(self, fs_path, local_path):
if self.is_exist(local_path):
raise FSFileExistsError
if not self.is_exist(fs_path):
raise FSFileNotExistsError
cmd = "{} -get {} {}".format(self._base_cmd, fs_path, local_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def mkdirs(self, fs_path):
if self.is_exist(fs_path):
return
cmd = "{} -mkdir {}".format(self._base_cmd, fs_path)
ret, lines = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def mv(self, fs_src_path, fs_dst_path, test_exists=True):
if test_exists:
if not self.is_exist(fs_src_path):
raise FSFileNotExistsError
if self.is_exist(fs_dst_path):
raise FSFileExistsError
cmd = "{} -mv {} {}".format(self._base_cmd, fs_src_path, fs_dst_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def _rmr(self, fs_path):
cmd = "{} -rmr {}".format(self._base_cmd, fs_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
@_handle_errors
def _rm(self, fs_path):
cmd = "{} -rm {}".format(self._base_cmd, fs_path)
ret, _ = self._run_cmd(cmd)
if ret != 0:
raise ExecuteError
def delete(self, fs_path):
if not self.is_exist(fs_path):
return
is_dir = self.is_dir(fs_path)
if is_dir:
return self._rmr(fs_path)
return self._rm(fs_path)
def need_upload_download(self):
return True
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Http Server."""
import logging
import six
# NOTE: HTTPServer has a different name in python2 and python3
if six.PY2:
from BaseHTTPServer import HTTPServer
import SimpleHTTPServer
else:
from http.server import HTTPServer
import http.server as SimpleHTTPServer
import time
import threading
import socket
def get_logger(name, level, fmt):
logger = logging.getLogger(name)
logger.setLevel(level)
handler = logging.FileHandler('http.log', mode='w')
formatter = logging.Formatter(fmt=fmt)
handler.setFormatter(formatter)
logger.addHandler(handler)
return logger
_http_server_logger = get_logger(
__name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
class KVHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
"""
kv handler class for kv http server,
it defines the way to get/set kv in server.
"""
def do_GET(self):
"""
get method for kv handler, get value according to key.
"""
log_str = "GET " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
with self.server.kv_lock:
value = self.server.kv.get(scope, {}).get(key)
if value is None:
log_str += ' , key not found: ' + key
self.send_status_code(404)
else:
log_str += ' , key found: ' + key
self.send_response(200)
self.send_header("Content-Length", str(len(value)))
self.end_headers()
self.wfile.write(value)
_http_server_logger.info(log_str)
def do_PUT(self):
"""
put method for kv handler, set value according to key.
"""
log_str = "PUT " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
content_length = int(self.headers['Content-Length'])
try:
value = self.rfile.read(content_length)
except Exception:
    print("received an invalid request")
self.send_status_code(404)
return
with self.server.kv_lock:
if self.server.kv.get(scope) is None:
self.server.kv[scope] = {}
self.server.kv[scope][key] = value
self.send_status_code(200)
_http_server_logger.info(log_str)
def do_DELETE(self):
"""
delete method for kv handler, delete value according to key.
"""
log_str = "DELETE " + self.address_string() + self.path
paths = self.path.split('/')
if len(paths) < 3:
print('len of request path must be 3: ' + self.path)
self.send_status_code(400)
return
_, scope, key = paths
with self.server.delete_kv_lock:
if self.server.delete_kv.get(scope) is None:
self.server.delete_kv[scope] = []
self.server.delete_kv[scope].append(key)
self.send_status_code(200)
_http_server_logger.info(log_str)
def log_message(self, format, *args):
"""
ignore all logging messages in kv handler.
"""
pass
def send_status_code(self, code):
"""
send status code back to client.
"""
self.send_response(code)
self.send_header("Content-Length", 0)
self.end_headers()
class KVHTTPServer(HTTPServer, object):
"""
it is a http server storing kv pairs.
"""
def __init__(self, port, handler):
"""Init."""
super(KVHTTPServer, self).__init__(('', port), handler)
self.delete_kv_lock = threading.Lock()
self.delete_kv = {}
self.kv_lock = threading.Lock()
self.kv = {}
    def get_deleted_size(self, key):
        """
        Returns how many keys have been deleted under `key`.
        """
        ret = 0
        with self.delete_kv_lock:
            ret = len(self.delete_kv.get(key, []))
        return ret
class KVServer:
"""
    A server that stores KV pairs, with an HTTP server inside.
"""
    def __init__(self, port, size=None):
        """Init."""
        self.http_server = KVHTTPServer(port, KVHandler)
        self.listen_thread = None
        self.size = size if size is not None else {}
def start(self):
"""
        Starts the server; it runs until the user calls stop().
"""
self.listen_thread = threading.Thread(
target=lambda: self.http_server.serve_forever())
self.listen_thread.start()
def stop(self):
"""
stop server and clear its resources.
"""
self.http_server.shutdown()
self.listen_thread.join()
self.http_server.server_close()
def shoud_stop(self):
"""
return whether the server should stop.
Returns:
ret(bool): whether the server should stop
"""
for key in self.size:
s = self.http_server.get_deleted_size(key)
if s != self.size.get(key, 0):
return False
return True
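# A minimal usage sketch of KVServer (Python 3 shown for brevity; the
# /scope/key path convention follows KVHandler above).
import http.client

server = KVServer(8003)
server.start()

conn = http.client.HTTPConnection("127.0.0.1", 8003)
conn.request("PUT", "/trainer/0", body=b"ready")
conn.getresponse().read()
conn.request("GET", "/trainer/0")
print(conn.getresponse().read())  # b'ready'
conn.request("DELETE", "/trainer/0")
conn.getresponse().read()

server.stop()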
......@@ -166,17 +166,34 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system()
read_env_flags = [
'check_nan_inf',
'fast_check_nan_inf',
'benchmark',
'eager_delete_scope',
'fraction_of_cpu_memory_to_use',
'initial_cpu_memory_in_mb',
'init_allocated_mem',
'paddle_num_threads',
'dist_threadpool_size',
'eager_delete_tensor_gb',
'fast_eager_deletion_mode',
'memory_fraction_of_eager_deletion',
'allocator_strategy',
'reader_queue_speed_test_mode',
'print_sub_graph_dir',
'pe_profile_fname',
'inner_op_parallelism',
'enable_parallel_graph',
'fuse_parameter_groups_size',
'multiple_of_cupti_buffer_size',
'fuse_parameter_memory_size',
'tracer_profile_fname',
'dygraph_debug',
'use_system_allocator',
'enable_unused_var_check',
'free_idle_chunk',
'free_when_no_cache_hit',
'call_stack_level',
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......@@ -208,12 +225,19 @@ def __bootstrap__():
if core.is_compiled_with_cuda():
read_env_flags += [
'fraction_of_gpu_memory_to_use',
'initial_gpu_memory_in_mb',
'reallocate_gpu_memory_in_mb',
'cudnn_deterministic',
'enable_cublas_tensor_op_math',
'conv_workspace_size_limit',
'cudnn_exhaustive_search',
'selected_gpus',
'sync_nccl_allreduce',
'cudnn_batchnorm_spatial_persistent',
'gpu_allocator_retry_time',
'local_exe_sub_scope_limit',
'gpu_memory_limit_mb',
]
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
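# Hedged usage sketch: each name in read_env_flags can be supplied through an
# environment variable named FLAGS_<name>, because __bootstrap__ passes
# --tryfromenv to gflags; values must be set before paddle.fluid is imported.
import os

os.environ['FLAGS_call_stack_level'] = '2'          # richer C++ error stacks
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'  # eager tensor GC

import paddle.fluid as fluid  # __bootstrap__ picks both flags up here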
......
......@@ -16,12 +16,13 @@ from __future__ import print_function
import os
import collections
from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer
import pickle
import six
from . import learning_rate_scheduler
import warnings
from .. import core
from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME, _load_persistable_vars
__all__ = [
'save_dygraph',
......@@ -140,22 +141,83 @@ def load_dygraph(model_path, keep_name_table=False):
elif model_prefix.endswith(".pdopt"):
model_prefix = model_prefix[:-6]
params_file_path = model_prefix + ".pdparams"
if not os.path.exists(params_file_path):
raise RuntimeError("Parameter file [ {} ] not exists".format(
params_file_path))
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
if not keep_name_table and "StructuredToParameterName@@" in para_dict:
del para_dict["StructuredToParameterName@@"]
para_dict = None
opti_dict = None
params_file_path = model_prefix + ".pdparams"
opti_file_path = model_prefix + ".pdopt"
if not os.path.exists(params_file_path) and not os.path.exists(
opti_file_path):
# Load state dict by `jit.save` save format
        # TODO(chenweihang): [Why not support `io.save_inference_model` save format here]
# The model saved by `save_inference_model` does not completely correspond to
# the information required by the `state_dict` under the dygraph.
# Although we reluctantly restore the `state_dict` in some scenarios,
# this may not be complete and there are some limitations, so this function
# will be considered later. The limitations include:
        # 1. `save_inference_model` does not save structured names, so we need to remind
        # the user to configure the `use_structured_name` argument when calling `set_dict`,
        # but this argument is currently not public
        # 2. if `save_inference_model` saves all persistable variables in a single file,
        # the user needs to provide the variable name list to load `state_dict`
# 1. check model path
if not os.path.isdir(model_prefix):
raise ValueError("Model saved directory '%s' is not exists." %
model_prefix)
# 2. load `__variables.info__`
var_info_path = os.path.join(model_prefix, EXTRA_VAR_INFO_FILENAME)
if not os.path.exists(var_info_path):
raise RuntimeError(
"No target can be loaded. Now only supports loading `state_dict` from "
"the result saved by `imperative.save` and `imperative.jit.save`."
)
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f)
# 3. load `__variables__`
# TODO(chenweihang): now only supports loading from default save format:
# - all persistable vars saved in one file named `__variables__`
# for other case, we may need to modify the arguments of this API
var_file_path = os.path.join(model_prefix, VARIABLE_FILENAME)
if not os.path.exists(var_file_path):
raise RuntimeError(
"The parameter file to be loaded was not found. "
"Now only supports loading from the default save format, "
"and does not support custom params_filename and "
"save parameters separately.")
# 4. load all persistable vars
load_var_list = []
for name in sorted(extra_var_info):
var = _varbase_creator(name=name, persistable=True)
load_var_list.append(var)
_dygraph_tracer().trace_op(
type='load_combine',
inputs={},
outputs={'Out': load_var_list},
attrs={'file_path': var_file_path})
# 5. construct state_dict
para_dict = dict()
for var in load_var_list:
structured_name = extra_var_info[var.name].get('structured_name',
None)
if structured_name is None:
raise RuntimeError(
"Cannot find saved variable (%s)'s structured name in saved model.",
var.name)
para_dict[structured_name] = var.numpy()
# NOTE: `jit.save` doesn't save optimizer state
else:
# Load state dict by `save_dygraph` save format
if os.path.exists(params_file_path):
with open(params_file_path, 'rb') as f:
para_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
if not keep_name_table and "StructuredToParameterName@@" in para_dict:
del para_dict["StructuredToParameterName@@"]
if os.path.exists(opti_file_path):
with open(opti_file_path, 'rb') as f:
opti_dict = pickle.load(f) if six.PY2 else pickle.load(
f, encoding='latin1')
return para_dict, opti_dict
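# Hedged usage sketch of the two branches above (paths are illustrative).
import paddle.fluid as fluid

with fluid.dygraph.guard():
    # Branch 1: "emb.pdparams" / "emb.pdopt" written by save_dygraph.
    para_state, opti_state = fluid.dygraph.load_dygraph("emb")

    # Branch 2: a directory written by jit.save; opti_state comes back None
    # because jit.save does not persist optimizer state.
    para_state, opti_state = fluid.dygraph.load_dygraph("saved_infer_model")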
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import traceback
from paddle.fluid.dygraph.dygraph_to_static.origin_info import Location, OriginInfo, global_origin_info_map
ERROR_DATA = "Error data about original source code information and traceback."
def attach_error_data(error, in_runtime=False):
"""
    Attaches error data about the original source code information and traceback to an error.
    Args:
        error(Exception): A native error.
        in_runtime(bool): `error` is raised at runtime if in_runtime is True, otherwise at compile time.
    Returns:
        An error with attached data about the original source code information and traceback.
"""
e_type, e_value, e_traceback = sys.exc_info()
tb = traceback.extract_tb(e_traceback)[1:]
error_data = ErrorData(e_type, e_value, tb, global_origin_info_map)
error_data.in_runtime = in_runtime
setattr(error, ERROR_DATA, error_data)
return error
class TraceBackFrame(OriginInfo):
"""
Traceback frame information.
"""
def __init__(self, location, function_name, source_code):
self.location = location
self.function_name = function_name
self.source_code = source_code
class ErrorData(object):
"""
Error data attached to an exception which is raised in un-transformed code.
"""
def __init__(self, error_type, error_value, origin_traceback,
origin_info_map):
self.error_type = error_type
self.error_value = error_value
self.origin_traceback = origin_traceback
self.origin_info_map = origin_info_map
self.in_runtime = False
def create_exception(self):
message = self.create_message()
new_exception = self.error_type(message)
setattr(new_exception, ERROR_DATA, self)
return new_exception
def create_message(self):
"""
        Creates a custom error message that includes the trace stack with the user's original dygraph source code information.
"""
message_lines = []
# Step1: Adds header message to prompt users that the following is the original information.
header_message = "In user code:"
message_lines.append(header_message)
message_lines.append("")
# Simplify error value to improve readability if error is raised in runtime
if self.in_runtime:
self._simplify_error_value()
message_lines.append(str(self.error_value))
return '\n'.join(message_lines)
# Step2: Optimizes stack information with source code information of dygraph from user.
for filepath, lineno, funcname, code in self.origin_traceback:
loc = Location(filepath, lineno)
dygraph_func_info = self.origin_info_map.get(loc.line_location,
None)
if dygraph_func_info:
# TODO(liym27): more information to prompt users that this is the original information.
# Replaces trace stack information about transformed static code with original dygraph code.
traceback_frame = self.origin_info_map[loc.line_location]
else:
traceback_frame = TraceBackFrame(loc, funcname, code)
message_lines.append(traceback_frame.formated_message())
# Step3: Adds error message like "TypeError: dtype must be int32, but received float32".
error_message = " " * 4 + traceback.format_exception_only(
self.error_type, self.error_value)[0].strip("\n")
message_lines.append(error_message)
return '\n'.join(message_lines)
def _simplify_error_value(self):
"""
Simplifies error value to improve readability if error is raised in runtime.
NOTE(liym27): The op callstack information about transformed static code has been replaced with original dygraph code.
TODO(liym27):
1. Need a more robust way because the code of start_trace may change.
2. Set the switch to determine whether to simplify error_value
"""
assert self.in_runtime is True
error_value_lines = str(self.error_value).split("\n")
error_value_lines_strip = [mes.lstrip(" ") for mes in error_value_lines]
start_trace = "outputs = static_func(*inputs)"
start_idx = error_value_lines_strip.index(start_trace)
error_value_lines = error_value_lines[start_idx + 1:]
error_value_str = '\n'.join(error_value_lines)
self.error_value = self.error_type(error_value_str)
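# Hedged sketch of the intended flow: attach_error_data hangs an ErrorData on
# the caught exception, and create_exception rebuilds a user-facing error
# whose message comes from create_message() (static_func is illustrative).
def run_static(static_func, *inputs):
    try:
        return static_func(*inputs)
    except Exception as e:
        attach_error_data(e)  # compile-time attachment
        error_data = getattr(e, ERROR_DATA)
        raise error_data.create_exception()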
......@@ -39,32 +39,21 @@ GENERATE_VARIABLE_PREFIX = 'generate_variable'
def create_while_node(condition_name, body_name, loop_var_names):
# NOTE(liym27):
# It's better to parse the source code into an AST node than to customize an AST node
# including child nodes, because it is easy to mistake the ast node type when customizing the node.
#
    # For example: loop_var_names = [a, b, foo.x], the type of `a` or `b` is gast.Name,
    # but the type of `foo.x` is gast.Attribute.
while_func_name = "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop"
while_node_str = "[{}] = {}({}, {}, [{}])".format(
",".join(loop_var_names), while_func_name, condition_name, body_name,
",".join(loop_var_names))
while_node = gast.parse(while_node_str).body[0]
return while_node
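# Hedged sketch reproducing the statement string that gets parsed above.
loop_var_names = ["i", "x.a"]
while_node_str = "[{}] = {}({}, {}, [{}])".format(
    ",".join(loop_var_names),
    "fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop",
    "cond_0", "body_0", ",".join(loop_var_names))
print(while_node_str)
# [i,x.a] = fluid.dygraph.dygraph_to_static.convert_operators.convert_while_loop(cond_0, body_0, [i,x.a])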
class NameVisitor(gast.NodeVisitor):
......
......@@ -19,8 +19,12 @@ import inspect
import gast
from paddle.fluid import core
from paddle.fluid.framework import Program
# NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node.
ORIGI_INFO = "Original information of source code for ast node."
ORIGI_INFO_MAP = "Original information map of source code."
class Location(object):
......@@ -64,6 +68,15 @@ class OriginInfo(object):
return "{} \nsource_code: {} in function {}\n ".format(
self.location, self.source_code, self.function_name)
def formated_message(self):
return ' File "{}", line {}, in {}\n\t{}'.format(
self.location.filepath, self.location.lineno, self.function_name,
self.source_code.lstrip())
def as_frame(self):
return (self.location.filepath, self.location.lineno,
self.function_name, self.source_code.lstrip())
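# Hedged sketch of formated_message output, built with the TraceBackFrame
# subclass from error.py above; Location(filepath, lineno) is assumed from
# its usages elsewhere in this change.
frame = TraceBackFrame(
    Location("path/to/model.py", 20), "forward", "    y = self.fc(x)")
print(frame.formated_message())
#   File "path/to/model.py", line 20, in forward
#       y = self.fc(x)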
class OriginInfoAttacher(gast.NodeTransformer):
"""
......@@ -119,7 +132,12 @@ class OriginInfoAttacher(gast.NodeTransformer):
return self.col_offset + node.col_offset
global_origin_info_map = {}
def create_and_update_origin_info_map(transformed_node,
static_func,
is_global=True):
"""
    Creates an origin-information map between the transformed static function and the original dygraph function.
......@@ -156,6 +174,10 @@ def create_origin_info_map(transformed_node, static_func):
origin_info_map[static_loc] = dygraph_info
global_origin_info_map.update(origin_info_map)
if is_global:
return global_origin_info_map
return origin_info_map
......@@ -234,3 +256,63 @@ def ast_walk(transformed_node, static_node):
if isinstance(d_item, gast.AST):
transformed_node_list.append(d_item)
static_node_list.append(s_item)
def update_op_callstack_with_origin_info(program):
"""
Replaces op callstack information about transformed static code with original dygraph code.
"""
assert isinstance(program, Program)
def get_new_op_callstack(callstack):
"""
An example of callstack:
File "path1/to/file.py", line 10, in func_1
y = fluid.layers.fill_constant(x, shape=[1], dtype="int32")
File "path2/to/file.py", line 740, in fill_constant
stop_gradient=True)
File "path3/to/file.py", line 43, in append_op
return self.main_program.current_block().append_op(*args, **kwargs)
File "path4/to/file.py", line 2811, in append_op
attrs=kwargs.get("attrs", None))
File "path5/to/file.py", line 1919, in __init__
for frame in traceback.extract_stack():
"""
assert len(callstack) % 2 == 0
for i in range(0, len(callstack), 2):
file_line = callstack[i].lstrip(" ").split(",")
filepath = file_line[0][6:-1]
lineno = int(file_line[1][6:])
funcname = file_line[2][4:]
code = callstack[i + 1].lstrip(" ")
loc = Location(filepath, lineno)
dygraph_func_info = global_origin_info_map.get(loc.line_location)
if dygraph_func_info:
filepath, lineno, funcname, code = \
dygraph_func_info.as_frame()
callstack[i] = ' File "{}", line {}, in {}'.format(
filepath, lineno, funcname)
callstack[i + 1] = ' {}'.format(code)
return callstack
op_maker = core.op_proto_and_checker_maker
callstack_var_name = op_maker.kOpCreationCallstackAttrName()
for block in program.blocks:
for i, op in enumerate(block.ops):
if op.has_attr(callstack_var_name):
callstack = op.attr(callstack_var_name)
callstack = get_new_op_callstack(callstack)
op._set_attr(callstack_var_name, callstack)
return program
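# Hedged sketch of the fixed-offset slicing used in get_new_op_callstack.
frame = '  File "path1/to/file.py", line 10, in func_1'
file_line = frame.lstrip(" ").split(",")
filepath = file_line[0][6:-1]    # drops 'File "' and the closing quote
lineno = int(file_line[1][6:])   # drops ' line '
funcname = file_line[2][4:]      # drops ' in '
print(filepath, lineno, funcname)  # path1/to/file.py 10 func_1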
......@@ -130,8 +130,6 @@ class PartialProgramLayer(layers.Layer):
self._check_params_all_inited(main_program)
# 2. Prune the parameters not used anywhere in the program.
self._prune_unused_params(main_program)
return main_program
......
......@@ -47,8 +47,7 @@ class PrintTransformer(gast.NodeTransformer):
# NOTE: deal with print in PY3
def visit_Call(self, node):
if isinstance(node.func, gast.Name) and node.func.id == 'print':
node = self._create_print_node(node.args)
return node
# NOTE: deal with print in PY2
......
......@@ -36,6 +36,9 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
from paddle.fluid.dygraph.dygraph_to_static.origin_info import attach_origin_info, create_and_update_origin_info_map
from paddle.fluid.dygraph.dygraph_to_static.origin_info import update_op_callstack_with_origin_info
from paddle.fluid.dygraph.dygraph_to_static.error import attach_error_data, ERROR_DATA
__all__ = ['ProgramTranslator', 'convert_to_static']
......@@ -88,15 +91,23 @@ class FunctionCache(object):
# with decorator directly and function.__wrapped__ holds the actual function.
func = getattr(func, '__wrapped__', func)
source_code = func_to_source_code(func)
# TODO(liym27):
# Consider this case: source_code in self._code_to_ast_caches,
# but actually they are methods in different classes.
# Maybe use (__class__, source_code) as key
if source_code in self._code_to_ast_caches:
root_wrapper = self._code_to_ast_caches[source_code]
else:
root = gast.parse(source_code)
root = attach_origin_info(root, func)
root_wrapper = self._dygraph_to_static.get_static_ast(root)
self._code_to_ast_caches[source_code] = root_wrapper
# Get static function from AST
static_func, file_name = ast_to_func(root_wrapper.node, func)
create_and_update_origin_info_map(root_wrapper.node, static_func)
return static_func
def exist(self, func):
......@@ -125,6 +136,7 @@ class FunctionSpec(object):
self._args = args
self._kwargs = kwargs
# TODO(liym27): func has multi layer decorator
dyfunc = getattr(func, '__wrapped__', func)
self._dyfunc_code = inspect.getsource(dyfunc)
......@@ -282,11 +294,19 @@ class ConcreteProgram(object):
# 3. Builds program only once and returns the output Variables.
with param_guard(func_spec.parameters(False)), param_guard(
func_spec.buffers(False)):
outputs = static_func(*inputs)
try:
outputs = static_func(*inputs)
except BaseException as e:
# NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here.
attach_error_data(e)
raise
if not isinstance(outputs,
(tuple, list)) and outputs is not None:
outputs = [outputs]
main_program = update_op_callstack_with_origin_info(main_program)
return ConcreteProgram(
inputs=inputs,
outputs=outputs,
......@@ -483,14 +503,24 @@ class ProgramTranslator(object):
return dygraph_func(*args, **kwargs)
function_spec = FunctionSpec(dygraph_func, args, kwargs)
concrete_program, partial_program_layer = self._program_cache[
function_spec]
if args and isinstance(args[0], layers.Layer):
# Synchronize self.training attribute.
partial_program_layer.training = args[0].training
args = args[1:]
try:
return partial_program_layer(args)
except BaseException as e:
# NOTE:
# 1. If e is raised in compile time, e should have been attached to ERROR_DATA before;
# 2. If e raised in runtime, e should be attached to ERROR_DATA here.
if not hasattr(e, ERROR_DATA):
# runtime error
attach_error_data(e, in_runtime=True)
raise
def get_func(self, dygraph_func):
"""
......
......@@ -425,8 +425,7 @@ def _load_persistable_vars(model_path,
params_filename=None):
# 1. load extra var info
with open(var_info_path, 'rb') as f:
extra_var_info = pickle.load(f)
# 2. construct var dict
load_var_dict = dict()
......
......@@ -15,20 +15,23 @@
from __future__ import print_function
import os
import pickle
import warnings
import six
from paddle.fluid import core
from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.base import program_desc_tracing_guard, switch_to_static_graph
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA
from paddle.fluid.dygraph.dygraph_to_static.program_translator import FunctionSpec, ProgramTranslator
from paddle.fluid.dygraph.io import EXTRA_VAR_INFO_FILENAME, VARIABLE_FILENAME, TranslatedLayer
from paddle.fluid.dygraph.layers import Layer
from paddle.fluid.executor import Executor, scope_guard
from paddle.fluid.framework import Block, ParamBase, Program, Variable
from paddle.fluid.framework import _current_expected_place, _dygraph_guard, _dygraph_tracer
from paddle.fluid.framework import dygraph_only, in_dygraph_mode
from paddle.fluid.wrapped_decorator import wrap_decorator
__all__ = ['TracedLayer', 'declarative', 'dygraph_to_static_func']
......@@ -167,7 +170,25 @@ def _declarative_(dygraph_func):
"The decorator 'declarative' doesn't work when setting ProgramTranslator.enable=False. "
"We will just return dygraph output.")
return dygraph_func(*args, **kwargs)
try:
return program_translator.get_output(dygraph_func, *args, **kwargs)
except Exception as e:
error_data = getattr(e, ERROR_DATA, None)
if error_data:
new_exception = error_data.create_exception()
if six.PY3:
# NOTE(liym27):
# 1. Why `raise new_exception from None`?
                    # In Python 3, by default, a new exception is raised with the trace information of the caught exception.
# This only raises new_exception and hides unwanted implementation details from tracebacks of the
# caught exception.
# 2. Use exec to bypass syntax error checking in Python 2.
six.exec_("raise new_exception from None")
else:
raise new_exception
else:
raise
return __impl__
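# Hedged usage sketch: how the rewritten exception surfaces to a user of
# @declarative; the buggy function is illustrative and mirrors the
# error-handling tests later in this change.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.jit import declarative
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA

@declarative
def buggy(x):
    x = fluid.dygraph.to_variable(x)
    return fluid.layers.reshape(x, shape=[1, 2])  # 6 elements -> 2: fails

with fluid.dygraph.guard():
    try:
        buggy(np.ones([3, 2]).astype('float32'))
    except Exception as e:
        # The message begins with "In user code:" and lists dygraph frames.
        print(hasattr(e, ERROR_DATA))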
......
......@@ -21,7 +21,7 @@ from paddle.fluid.executor import Executor
from paddle.fluid.optimizer import SGD
from paddle.fluid.incubate.fleet.base.mode import Mode
from paddle.fleet.base.role_maker import RoleMakerBase
from paddle.fluid.contrib.mixed_precision.decorator import OptimizerWithMixedPrecision
from . import mode
......@@ -209,7 +209,10 @@ class Fleet(object):
self._executor = Executor(fluid.CPUPlace())
from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase as RoleMakerBaseIncubate
if role_maker and not isinstance(role_maker, RoleMakerBaseIncubate):
raise TypeError(
"role_maker must be an instance of RoleMakerBase")
self._role_maker = role_maker
self._role_maker.generate_role()
......
......@@ -579,7 +579,7 @@ class FleetTranspiler(Fleet):
block.append_op(
type='recv_save',
attrs={
"trainer_id": self._role_maker.worker_id(),
"trainer_id": self._role_maker.worker_index(),
"shape": var.shape,
"slice_shapes":
[",".join([str(i) for i in var.shape])],
......
......@@ -329,7 +329,7 @@ class CompileTimeStrategy(object):
is_distributed = True if param_name in distibuted_varnames else False
ctx = self.build_ctx(grad, self.grad_var_mapping, True, True,
True, is_distributed)
send_ctx[ctx.var_name()] = ctx
......
......@@ -6200,7 +6200,7 @@ def squeeze(input, axes, name=None):
Out.shape = [1,3,5]
Args:
input (Variable): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
axes (list): One integer or List of integers, indicating the dimensions to be squeezed.
Axes range is :math:`[-rank(input), rank(input))`.
If axes is negative, :math:`axes=axes+rank(input)`.
......@@ -6226,8 +6226,9 @@ def squeeze(input, axes, name=None):
helper = LayerHelper("squeeze", **locals())
check_variable_and_dtype(
input, 'input',
['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
'squeeze')
check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
......@@ -6254,12 +6255,12 @@ def unsqueeze(input, axes, name=None):
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
Args:
input (Variable): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
name (str|None): Name for this layer.
Returns:
Variable: Unsqueezed Tensor, with the same data type as input.
Examples:
.. code-block:: python
......@@ -6269,10 +6270,15 @@ def unsqueeze(input, axes, name=None):
y = fluid.layers.unsqueeze(input=x, axes=[1])
"""
if in_dygraph_mode():
out, _ = core.ops.unsqueeze2(input, 'axes', axes)
return out
check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
check_variable_and_dtype(
input, 'input',
['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
'unsqueeze')
helper = LayerHelper("unsqueeze2", **locals())
inputs = {"X": input}
attrs = {}
......@@ -9966,7 +9972,7 @@ def stack(x, axis=0, name=None):
must be the same. Supposing input is N dims
Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
Supported data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
The default value of axis is 0.
......
......@@ -685,8 +685,9 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
"""
attrs = {'force_cpu': force_cpu}
dtype = convert_dtype(dtype)
if not isinstance(value, Variable):
if dtype in ['int64', 'int32']:
attrs['str_value'] = str(int(value))
else:
attrs['str_value'] = str(float(value))
......@@ -697,7 +698,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
out = _varbase_creator(dtype=dtype)
if isinstance(value, Variable):
if dtype in ['int64', 'int32']:
attrs['str_value'] = str(int(value.numpy()))
else:
attrs['str_value'] = str(float(value.numpy()))
......@@ -712,6 +713,8 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
helper = LayerHelper("fill_constant", **locals())
inputs = {}
if isinstance(value, Variable):
if convert_dtype(value.dtype) != dtype:
value = cast(value, dtype)
inputs['ValueTensor'] = value
check_dtype(dtype, 'dtype',
......
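# Hedged sketch of the ValueTensor cast above, mirroring the val2/out_8 case
# added to the fill_constant test further below (static graph mode).
import paddle.fluid as fluid

val64 = fluid.layers.fill_constant(shape=[1], dtype='float64', value=1.1)
# Requested dtype float32 != value dtype float64, so value is cast first.
out = fluid.layers.fill_constant(shape=[1, 2], dtype='float32', value=val64)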
......@@ -345,7 +345,6 @@ if(WITH_DISTRIBUTE)
# FIXME(typhoonzero): add these tests back
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
#not need
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base")
......
......@@ -28,6 +28,7 @@ import numpy as np
import ctr_dataset_reader
from test_dist_fleet_base import runtime_main, FleetDistRunnerBase
from paddle.fleet.base.util_factory import fleet_util
# Fix seed for test
fluid.default_startup_program().random_seed = 1
......@@ -181,8 +182,14 @@ class TestDistCTR2x2(FleetDistRunnerBase):
loss_val = exe.run(program=compiled_prog,
fetch_list=[self.avg_cost.name])
loss_val = np.mean(loss_val)
print("TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
loss_val))
reduce_output = fleet_util.all_reduce(
np.array(loss_val), mode="sum")
loss_all_trainer = fleet_util.all_gather(float(loss_val))
loss_val = float(reduce_output) / len(loss_all_trainer)
message = "TRAIN ---> pass: {} loss: {}\n".format(epoch_id,
loss_val)
fleet_util.print_on_rank(message, 0)
pass_time = time.time() - pass_start
except fluid.core.EOFException:
self.reader.reset()
......
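# Hedged sketch of the cross-trainer averaging above: all_reduce(mode="sum")
# sums per-trainer losses, all_gather returns one entry per trainer, so
# dividing recovers the mean (values illustrative).
losses = [0.4, 0.6]          # one loss per trainer
reduce_output = sum(losses)  # what all_reduce(..., mode="sum") yields
loss_all_trainer = losses    # what all_gather(...) yields
print(float(reduce_output) / len(loss_all_trainer))  # 0.5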
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import inspect
import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.core import EnforceNotMet
from paddle.fluid.dygraph.dygraph_to_static.error import ERROR_DATA, ErrorData
from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap
from paddle.fluid.dygraph.jit import declarative
def inner_func():
fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")
return
@declarative
def func_error_in_compile_time(x):
x = fluid.dygraph.to_variable(x)
inner_func()
if fluid.layers.mean(x) < 0:
x_v = x - 1
else:
x_v = x + 1
return x_v
@declarative
def func_error_in_compile_time_2(x):
x = fluid.dygraph.to_variable(x)
x = fluid.layers.reshape(x, shape=[1, 2])
return x
@declarative
def func_error_in_runtime(x, iter_num=3):
x = fluid.dygraph.to_variable(x)
two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32")
x = fluid.layers.reshape(x, shape=[1, two])
return x
class TestErrorInCompileTime(unittest.TestCase):
def setUp(self):
self.set_func()
self.set_input()
self.set_exception_type()
def set_func(self):
self.func = func_error_in_compile_time
def set_exception_type(self):
self.exception_type = TypeError
def set_input(self):
self.input = np.ones([3, 2])
def set_message(self):
self.expected_message = \
['File "{}", line 36, in func_error_in_compile_time'.format(self.filepath),
'inner_func()',
'File "{}", line 29, in inner_func'.format(self.filepath),
'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")',
]
def _test_create_message(self, error_data):
self.filepath = inspect.getfile(unwrap(self.func))
self.set_message()
error_message = error_data.create_message()
self.assertIn('In user code:', error_message)
for m in self.expected_message:
self.assertIn(m, error_message)
def test(self):
with fluid.dygraph.guard():
with self.assertRaises(self.exception_type) as cm:
self.func(self.input)
exception = cm.exception
error_data = getattr(exception, ERROR_DATA)
self.assertIsInstance(error_data, ErrorData)
self._test_create_message(error_data)
class TestErrorInCompileTime2(TestErrorInCompileTime):
def set_func(self):
self.func = func_error_in_compile_time_2
def set_exception_type(self):
self.exception_type = EnforceNotMet
def set_message(self):
self.expected_message = \
[
'File "{}", line 47, in func_error_in_compile_time_2'.format(self.filepath),
'x = fluid.layers.reshape(x, shape=[1, 2])'
]
class TestErrorInRuntime(TestErrorInCompileTime):
def set_func(self):
self.func = func_error_in_runtime
def set_exception_type(self):
self.exception_type = EnforceNotMet
def set_message(self):
self.expected_message = \
[
'File "{}", line 55, in func_error_in_runtime'.format(self.filepath),
'x = fluid.layers.reshape(x, shape=[1, two])'
]
def _test_create_message(self, error_data):
self.filepath = inspect.getfile(unwrap(self.func))
self.set_message()
with self.assertRaises(ValueError):
error_data.create_message()
error_data.in_runtime = False
error_message = error_data.create_message()
self.assertIn('In user code:', error_message)
for m in self.expected_message:
self.assertIn(m, error_message)
if __name__ == '__main__':
unittest.main()
......@@ -90,7 +90,8 @@ class TestOriginInfo(unittest.TestCase):
# step3
self.static_func, _ = ast_to_func(transformed_ast, self.dygraph_func)
info_map = create_and_update_origin_info_map(dygraph_ast,
self.static_func)
return info_map
......
......@@ -17,7 +17,7 @@ import sys
import time
def train(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
......@@ -29,11 +29,12 @@ def train():
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
def train_abort(prefix):
selected_gpus = os.getenv("FLAGS_selected_gpus")
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
......@@ -49,8 +50,9 @@ def train_abort():
name = "abort>>> selected_gpus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id),
"w") as f:
with open(
"multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
raise
else:
......@@ -60,12 +62,15 @@ def train_abort():
.format(selected_gpus, worker_endpoints, trainers_num, current_endpoint,trainer_id)
print(name)
with open("multi_process.check_{}.log".format(trainer_id), "w") as f:
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id),
"w") as f:
f.write(name)
if __name__ == '__main__':
if len(sys.argv) == 2 and sys.argv[1] == "abort":
train_abort()
if len(sys.argv) == 3 and sys.argv[2] == "abort":
prefix = sys.argv[1]
train_abort(prefix)
else:
prefix = sys.argv[1]
train(prefix)
......@@ -63,18 +63,104 @@ class TestAddMMOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
# The input type of addmm_op must be Variable.
input = fluid.create_lod_tensor(
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
x1 = fluid.create_lod_tensor(
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
x2 = fluid.create_lod_tensor(
np.array([[-1, -1], [-1, -1]]), [[2]], fluid.CPUPlace())
self.assertRaises(TypeError, paddle.addmm, input, x1, x2)
            # The input dtype of addmm_op must be float32 or float64.
input = fluid.layers.data(name='input', shape=[4], dtype="int32")
x3 = fluid.layers.data(name='x3', shape=[4], dtype="int32")
x4 = fluid.layers.data(name='x4', shape=[4], dtype="int32")
input = fluid.layers.data(
name='input',
shape=[4, 4],
dtype="int32",
append_batch_size=False)
x3 = fluid.layers.data(
name='x3', shape=[4, 4], dtype="int32", append_batch_size=False)
x4 = fluid.layers.data(
name='x4', shape=[4, 4], dtype="int32", append_batch_size=False)
self.assertRaises(TypeError, paddle.addmm, input, x3, x4)
# x and y dimension mismatch
x5 = fluid.layers.data(
name='x5',
shape=[4, 5],
dtype="float32",
append_batch_size=False)
x6 = fluid.layers.data(
name='x6',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input, x5, x6)
# input and x are not broadcastable
x7 = fluid.layers.data(
name='x7',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x8 = fluid.layers.data(
name='x8',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input1 = fluid.layers.data(
name='input1',
shape=[2, 4],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input1, x7, x8)
# input and x are not broadcastable
x9 = fluid.layers.data(
name='x9',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x10 = fluid.layers.data(
name='x10',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input2 = fluid.layers.data(
name='input2',
shape=[1, 2],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input2, x9, x10)
x11 = fluid.layers.data(
name='x11',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x12 = fluid.layers.data(
name='x12',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input3 = fluid.layers.data(
name='input3',
shape=[4, 2],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input3, x11, x12)
x13 = fluid.layers.data(
name='x13',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
x14 = fluid.layers.data(
name='x14',
shape=[4, 4],
dtype="float32",
append_batch_size=False)
input4 = fluid.layers.data(
name='input4',
shape=[3, 1],
dtype="float32",
append_batch_size=False)
self.assertRaises(ValueError, paddle.addmm, input4, x13, x14)
class TestAddMMOp2(TestAddMMOp):
......@@ -147,5 +233,23 @@ class TestAddMMOp4(unittest.TestCase):
assert np.allclose(np_input + np.dot(np_x, np_y), out.numpy())
'''
class TestAddMMAPI(unittest.TestCase):
def test_api_error(self):
data_x = np.ones((2, 2)).astype(np.float32)
data_y = np.ones((2, 2)).astype(np.float32)
data_input = np.ones((2, 2)).astype(np.float32)
paddle.enable_imperative()
def test_error1():
data_x_wrong = np.ones((2, 3)).astype(np.float32)
x = paddle.imperative.to_variable(data_x_wrong)
y = paddle.imperative.to_variable(data_y)
input = paddle.imperative.to_variable(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
self.assertRaises(ValueError, test_error1)
'''
if __name__ == "__main__":
unittest.main()
......@@ -73,5 +73,15 @@ class API_TestDygraphBmm(unittest.TestCase):
self.assertTrue(np.allclose(expected_result, out_np))
class TestBmmAPIError(unittest.TestCase):
def test_api_error(self):
x_data = np.arange(24, dtype='float32').reshape((2, 3, 4))
y_data = np.arange(16, dtype='float32').reshape((2, 4, 2))
y_data_wrong1 = np.arange(16, dtype='float32').reshape((2, 2, 4))
y_data_wrong2 = np.arange(16, dtype='float32').reshape((2, 2, 2, 2))
self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong1)
self.assertRaises(ValueError, paddle.bmm, x_data, y_data_wrong2)
if __name__ == "__main__":
unittest.main()
......@@ -21,6 +21,9 @@ import os
import sys
import subprocess
import six
import shutil
import numpy as np
import argparse
from contextlib import closing
import socket
......@@ -29,7 +32,8 @@ import tempfile
import unittest
import paddle.fluid as fluid
import paddle.fleet.base.role_maker as role_maker
from paddle.fleet.base.util_factory import fleet_util
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import StrategyFactory
......@@ -48,18 +52,26 @@ class FleetDistRunnerBase(object):
"""
def build_role(self, args):
if args.role.upper() == "PSERVER":
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=True,
path=args.gloo_path,
current_id=args.current_id,
role=role_maker.Role.SERVER,
worker_num=args.trainers,
worker_endpoints=args.trainer_endpoints.split(","),
server_endpoints=args.endpoints.split(","))
else:
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=True,
path=args.gloo_path,
current_id=args.current_id,
role=role_maker.Role.WORKER,
worker_num=args.trainers,
worker_endpoints=args.trainer_endpoints.split(","),
server_endpoints=args.endpoints.split(","))
self.role = role
return role
def build_strategy(self, args):
......@@ -114,26 +126,13 @@ class FleetDistRunnerBase(object):
optimizer.minimize(avg_cost)
def run_pserver(self, args):
fleet.init_server()
fleet.run_server()
def run_dataset_trainer(self, args):
out = self.do_dataset_training(fleet)
def run_pyreader_trainer(self, args):
out = self.do_pyreader_training(fleet)
def net(self, args, batch_size=4, lr=0.01):
......@@ -173,10 +172,14 @@ class TestFleetBase(unittest.TestCase):
print("set begin_port:", DIST_UT_PORT)
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT, DIST_UT_PORT + 1)
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
DIST_UT_PORT + 2, DIST_UT_PORT + 3)
DIST_UT_PORT += 4
else:
self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._tr_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
self._find_free_port(), self._find_free_port())
self._python_interp = sys.executable
self._geo_sgd_need_push_nums = 5
......@@ -236,18 +239,22 @@ class TestFleetBase(unittest.TestCase):
def _run_cluster(self, model, envs):
env = {'GRAD_CLIP': str(self._grad_clip_mode)}
python_path = self._python_interp
gloo_path = tempfile.mkdtemp()
if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
envs['COVERAGE_FILE'] = os.getenv('COVERAGE_FILE', '')
python_path += " -m coverage run --branch -p"
env.update(envs)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
python_path, model, self._ps_endpoints, self._trainers, self._mode,
self._geo_sgd_need_push_nums, self._reader)
tr_cmd = "{0} {1} --role trainer --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --current_id {{}} --trainers {3} --mode {4} --geo_sgd_need_push_nums {5} --reader {6}".format(
python_path, model, self._ps_endpoints, self._trainers, self._mode,
self._geo_sgd_need_push_nums, self._reader)
ps_cmd = "{0} {1} --role pserver --endpoints {2} --trainer_endpoints {3} --current_id {{}} --trainers {4} --mode {5} --geo_sgd_need_push_nums {6} --reader {7} --gloo_path {8}".format(
python_path, model, self._ps_endpoints, self._tr_endpoints,
self._trainers, self._mode, self._geo_sgd_need_push_nums,
self._reader, gloo_path)
# Run dist train to compare with local results
ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env)
......@@ -284,6 +291,7 @@ class TestFleetBase(unittest.TestCase):
ps0.terminate()
ps1.terminate()
shutil.rmtree(gloo_path)
return 0, 0
def check_with_place(self,
......@@ -313,6 +321,9 @@ def runtime_main(test_class):
parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer'])
parser.add_argument('--endpoints', type=str, required=False, default="")
parser.add_argument(
'--trainer_endpoints', type=str, required=False, default="")
parser.add_argument('--gloo_path', type=str, required=False, default="")
parser.add_argument('--current_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1)
parser.add_argument('--mode', type=str, required=False, default='geo')
......@@ -322,6 +333,13 @@ def runtime_main(test_class):
args = parser.parse_args()
model = test_class()
role = model.build_role(args)
fleet.init(role)
strategy = model.build_strategy(args)
avg_cost = model.net(args)
model.build_optimizer(avg_cost, strategy)
fleet_util._set_strategy(strategy)
fleet_util._set_role_maker(role)
if args.role == "pserver":
model.run_pserver(args)
else:
......
......@@ -22,7 +22,7 @@ from test_dist_fleet_base import TestFleetBase
class TestDistMnistSync2x2(TestFleetBase):
def _setup_config(self):
self._mode = "sync"
self._mode = "async"
self._reader = "pyreader"
def check_with_place(self,
......
......@@ -269,18 +269,26 @@ class TestFillConstantAPI(unittest.TestCase):
out_6 = fluid.layers.fill_constant(
shape=shape_tensor_int64, dtype=np.float32, value=1.1)
val1 = fluid.layers.fill_constant(
shape=[1], dtype=np.float32, value=1.1)
val2 = fluid.layers.fill_constant(
shape=[1], dtype=np.float64, value=1.1)
out_7 = fluid.layers.fill_constant(
shape=shape_tensor_int64, dtype=np.float32, value=val1)
out_8 = fluid.layers.fill_constant(
shape=shape_tensor_int64, dtype=np.float32, value=val2)
exe = fluid.Executor(place=fluid.CPUPlace())
res_1, res_2, res_3, res_4, res_5, res_6, res_7, res_8 = exe.run(
fluid.default_main_program(),
feed={
"shape_tensor_int32": np.array([1, 2]).astype("int32"),
"shape_tensor_int64": np.array([1, 2]).astype("int64"),
},
fetch_list=[
out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8
])
assert np.array_equal(res_1, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_2, np.full([1, 2], 1.1, dtype="float32"))
......@@ -289,6 +297,31 @@ class TestFillConstantAPI(unittest.TestCase):
assert np.array_equal(res_5, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_6, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_7, np.full([1, 2], 1.1, dtype="float32"))
assert np.array_equal(res_8, np.full([1, 2], 1.1, dtype="float32"))
class TestFillConstantImperative(unittest.TestCase):
def test_api(self):
with fluid.dygraph.guard():
data1 = np.array([1, 2]).astype('int32')
data2 = np.array([1.1]).astype('float32')
shape = fluid.dygraph.to_variable(data1)
val = fluid.dygraph.to_variable(data2)
res1 = fluid.layers.fill_constant(
shape=[1, 2], dtype='float32', value=1.1)
res2 = fluid.layers.fill_constant(
shape=shape, dtype='float32', value=1.1)
res3 = fluid.layers.fill_constant(
shape=shape, dtype='float32', value=val)
assert np.array_equal(
res1.numpy(), np.full(
[1, 2], 1.1, dtype="float32"))
assert np.array_equal(
res2.numpy(), np.full(
[1, 2], 1.1, dtype="float32"))
assert np.array_equal(
res3.numpy(), np.full(
[1, 2], 1.1, dtype="float32"))
class TestFillConstantOpError(unittest.TestCase):
......
......@@ -4,7 +4,6 @@ set -e
function test_launch_ps(){
fleetrun --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog
if grep -q "server are killed" ut.elog; then
echo "test pserver launch succeed"
else
......@@ -20,7 +19,7 @@ fi
test_launch_ps
# use default values
fleetrun multi_process.py fleetrun
# use paddlecloud
echo "begin test use paddlecloud"
......@@ -30,16 +29,16 @@ export POD_IP=127.0.0.1
export PADDLE_TRAINERS=127.0.0.1,127.0.0.2
export PADDLE_TRAINER_ID=0
export PADDLE_PORT=35789
export TRAINER_PORTS_NUM=2
distributed_args="--ips=${cluster_node_ips} --gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
file_0="multi_process_fleetrun.check_0.log"
file_1="multi_process_fleetrun.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
......@@ -70,7 +69,7 @@ unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 fleetrun ${distributed_args} multi_process.py fleetrun abort; then
echo "train abort as planned"
fi
......
......@@ -40,10 +40,9 @@ class TestCloudRoleMaker(unittest.TestCase):
from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
from paddle.fluid.incubate.fleet.base.role_maker import \
GeneralRoleMaker
from paddle.fleet.utils import KVHandler
from paddle.fleet.utils import KVServer
from paddle.fleet.utils import KVHTTPServer
except:
print("warning: no fleet, skip test_pslib_4")
return
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Test cloud role maker."""
from __future__ import print_function
import os
import unittest
import paddle.fleet.base.role_maker as role_maker
class TestRoleMakerBase(unittest.TestCase):
"""
Test cases for RoleMakerBase
"""
def test_rolemaker_base(self):
role = role_maker.RoleMakerBase()
self.assertRaises(Exception, role.is_worker)
self.assertRaises(Exception, role.is_server)
self.assertRaises(Exception, role.is_first_worker)
self.assertRaises(Exception, role.worker_num)
self.assertRaises(Exception, role.server_num)
self.assertRaises(Exception, role.worker_index)
self.assertRaises(Exception, role.server_index)
self.assertRaises(Exception, role.role_id)
trainer_endpoints = role.get_trainer_endpoints()
self.assertTrue(len(trainer_endpoints) == 0)
pserver_endpoints = role.get_pserver_endpoints()
self.assertTrue(len(pserver_endpoints) == 0)
print(role.to_string())
self.assertTrue(role._all_gather(role._node_type_comm, 1) is None)
self.assertTrue(role._all_reduce(role._node_type_comm, 1) is None)
role._barrier(role._node_type_comm)
class TestCloudRoleMaker(unittest.TestCase):
"""
Test cases for PaddleCloudRoleMaker.
"""
def setUp(self):
"""Set up, set envs."""
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ[
"PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ["POD_IP"] = "127.0.0.1"
def test_tr_rolemaker(self):
"""Test tr rolenamer."""
os.environ["TRAINING_ROLE"] = "TRAINER"
os.environ["PADDLE_TRAINER_ID"] = "0"
try:
import netifaces
except:
print("warning: no netifaces, skip test_tr_rolemaker")
return
ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False)
self.assertTrue(ro.is_worker())
self.assertFalse(ro.is_server())
self.assertEqual(ro.worker_num(), 2)
self.assertTrue(ro.is_first_worker())
worker_endpoints = ro.get_trainer_endpoints()
self.assertEqual(worker_endpoints[0], '127.0.0.1:36001')
self.assertEqual(ro.role_id(), 0)
def test_tr_rolemaker_collective(self):
ro = role_maker.PaddleCloudRoleMaker(is_collective=True)
self.assertEqual(ro.worker_num(), 2)
def test_ps_rolemaker(self):
"""Test ps rolemaker."""
os.environ["TRAINING_ROLE"] = "PSERVER"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
try:
import netifaces
except:
print("warning: no netifaces, skip test_ps_rolemaker")
return
ro = role_maker.PaddleCloudRoleMaker(
is_collective=False, init_gloo=False)
self.assertEqual(ro.server_index(), 0)
self.assertFalse(ro.is_worker())
self.assertTrue(ro.is_server())
self.assertEqual(ro.server_num(), 2)
pserver_endpoints = ro.get_pserver_endpoints()
self.assertEqual(pserver_endpoints[0], '127.0.0.1:36001')
self.assertTrue(ro._all_gather(ro._all_comm, 1) is None)
self.assertTrue(ro._all_reduce(ro._all_comm, 1) is None)
    def test_training_role(self):
"""Test training role."""
os.environ["TRAINING_ROLE"] = "TEST"
try:
import netifaces
except:
print("warning: no netifaces, skip test_training_role")
return
ro = role_maker.PaddleCloudRoleMaker(is_collective=False)
self.assertRaises(ValueError, ro.generate_role)
class TestUserDefinedRoleMaker(unittest.TestCase):
"""
Test cases for UserDefinedRoleMaker.
"""
def setUp(self):
pass
def test_ps_rolemaker(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_ps_rolemaker")
return
ro = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
role=role_maker.Role.SERVER,
current_id=0,
worker_num=2)
self.assertEqual(ro.server_num(), 2)
ro.generate_role()
self.assertTrue(ro.is_server())
self.assertEqual(ro.role_id(), 0)
def test_tr_rolemaker(self):
try:
import netifaces
except:
print("warning: no netifaces, skip test_tr_rolemaker")
return
ro = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
server_endpoints="127.0.0.1:36001,127.0.0.1:36001",
role=role_maker.Role.WORKER,
current_id=0,
worker_num=2)
self.assertIn("127.0.0.1:36001", ro.get_pserver_endpoints())
self.assertTrue(ro.is_worker())
self.assertEqual(ro.role_id(), 0)
if __name__ == "__main__":
unittest.main()
......@@ -12,12 +12,27 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import unittest
import numpy as np
import tarfile
import tempfile
import os
import sys
from paddle.dataset.common import download, DATA_HOME
from paddle.fleet.base.util_factory import fleet_util
import paddle.fleet.base.role_maker as role_maker
class TestFleetUtil(unittest.TestCase):
proto_data_url = "https://fleet.bj.bcebos.com/fleet_util_data.tgz"
proto_data_md5 = "59b7f12fd9dc24b64ae8e4629523a92a"
module_name = "fleet_util_data"
pruned_dir = os.path.join("fleet_util_data", "pruned_model")
train_dir = os.path.join("fleet_util_data", "train_program")
def test_util_base(self):
import paddle.fleet as fleet
util = fleet.UtilBase()
......@@ -65,6 +80,262 @@ class TestFleetUtil(unittest.TestCase):
user_id = fleet.util.get_user_id()
self.assertEqual(user_id, 10)
def test_fs(self):
from paddle.fleet.utils import LocalFS
fs = LocalFS()
dirs, files = fs.ls_dir("test_tmp")
dirs, files = fs.ls_dir("./")
self.assertFalse(fs.need_upload_download())
fleet_util.set_file_system(fs)
def test_barrier(self):
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_barrier")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_barrier", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
fleet_util.barrier("worker")
def test_all_reduce(self):
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_all_reduce")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_reduce(1, "sum", comm_world="server")
print(output)
# self.assertEqual(output, 1)
def test_all_gather(self):
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_all_gather")
return
gloo = fluid.core.Gloo()
gloo.set_rank(0)
gloo.set_size(1)
gloo.set_prefix("123")
gloo.set_iface("lo")
gloo.set_hdfs_store("./tmp_test_fleet_reduce", "", "")
gloo.init()
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.SERVER,
worker_endpoints=["127.0.0.1:6003"],
server_endpoints=["127.0.0.1:6001"])
role._node_type_comm = gloo
role._all_comm = gloo
role._role_is_generated = True
fleet_util._set_role_maker(role)
output = fleet_util.all_gather(1, comm_world="all")
print(output)
# self.assertTrue(len(output) == 1 and output[0] == 1)
self.assertRaises(Exception, fleet_util.all_gather, 1, "test")
def download_files(self):
path = download(self.proto_data_url, self.module_name,
self.proto_data_md5)
print('data is downloaded at ' + path)
tar = tarfile.open(path)
unzip_folder = tempfile.mkdtemp()
tar.extractall(unzip_folder)
return unzip_folder
def test_get_file_shard(self):
self.assertRaises(Exception, fleet_util.get_file_shard, "files")
try:
import netifaces
except ImportError:
print("warning: no netifaces, skip test_get_file_shard")
return
role = role_maker.UserDefinedRoleMaker(
is_collective=False,
init_gloo=False,
current_id=0,
role=role_maker.Role.WORKER,
worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
fleet_util._set_role_maker(role)
files = fleet_util.get_file_shard(["1", "2", "3"])
self.assertTrue(len(files) == 2 and "1" in files and "2" in files)
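# get_file_shard hands each worker a contiguous block of the file list; with
# 3 files and 2 workers, worker 0 gets two files and worker 1 gets one, which
# is exactly what the assertion above checks. A sketch of that rule (an
# illustrative reimplementation, not the fleet_util code itself):
def sketch_file_shard(files, worker_num, worker_index):
    blocksize = len(files) // worker_num
    remainder = len(files) % worker_num
    # the first `remainder` workers each receive one extra file
    begin = worker_index * blocksize + min(worker_index, remainder)
    end = begin + blocksize + (1 if worker_index < remainder else 0)
    return files[begin:end]

assert sketch_file_shard(["1", "2", "3"], 2, 0) == ["1", "2"]
assert sketch_file_shard(["1", "2", "3"], 2, 1) == ["3"]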
def test_program_type_trans(self):
data_dir = self.download_files()
program_dir = os.path.join(data_dir, self.pruned_dir)
text_program = "pruned_main_program.pbtxt"
binary_program = "pruned_main_program.bin"
text_to_binary = fleet_util._program_type_trans(program_dir,
text_program, True)
binary_to_text = fleet_util._program_type_trans(program_dir,
binary_program, False)
self.assertTrue(
os.path.exists(os.path.join(program_dir, text_to_binary)))
self.assertTrue(
os.path.exists(os.path.join(program_dir, binary_to_text)))
def test_params_check(self):
data_dir = self.download_files()
class config:
pass
feed_config = config()
feed_config.feeded_vars_names = ['concat_1.tmp_0', 'concat_2.tmp_0']
feed_config.feeded_vars_dims = [682, 1199]
feed_config.feeded_vars_types = [np.float32, np.float32]
feed_config.feeded_vars_filelist = [
os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_1")),
os.path.join(data_dir, os.path.join(self.pruned_dir, "concat_2"))
]
fetch_config = config()
fetch_config.fetch_vars_names = ['similarity_norm.tmp_0']
conf = config()
conf.batch_size = 1
conf.feed_config = feed_config
conf.fetch_config = fetch_config
conf.dump_model_dir = os.path.join(data_dir, self.pruned_dir)
conf.dump_program_filename = "pruned_main_program.pbtxt"
conf.is_text_dump_program = True
conf.save_params_filename = None
# test saved var's shape
conf.dump_program_filename = "pruned_main_program.save_var_shape_not_match"
self.assertRaises(Exception, fleet_util._params_check, conf)
# test program.proto without feed_op and fetch_op
conf.dump_program_filename = "pruned_main_program.no_feed_fetch"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
np.testing.assert_array_almost_equal(
results[0], np.array(
[[3.0590223e-07]], dtype=np.float32))
# test feed_var's shape
conf.dump_program_filename = "pruned_main_program.feed_var_shape_not_match"
self.assertRaises(Exception, fleet_util._params_check, conf)
# test correct case with feed_vars_filelist
conf.dump_program_filename = "pruned_main_program.pbtxt"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
np.testing.assert_array_almost_equal(
results[0], np.array(
[[3.0590223e-07]], dtype=np.float32))
# test correct case without feed_vars_filelist
conf.feed_config.feeded_vars_filelist = None
# test feed var with lod_level >= 2
conf.dump_program_filename = "pruned_main_program.feed_lod2"
self.assertRaises(Exception, fleet_util._params_check, conf)
conf.dump_program_filename = "pruned_main_program.pbtxt"
results = fleet_util._params_check(conf)
self.assertTrue(len(results) == 1)
def test_proto_check(self):
data_dir = self.download_files()
class config:
pass
conf = config()
conf.train_prog_path = os.path.join(
data_dir, os.path.join(self.train_dir, "join_main_program.pbtxt"))
conf.is_text_train_program = True
# test not match
conf.pruned_prog_path = os.path.join(
data_dir,
os.path.join(self.pruned_dir,
"pruned_main_program.save_var_shape_not_match"))
conf.is_text_pruned_program = True
conf.draw = False
res = fleet_util._proto_check(conf)
self.assertFalse(res)
# test match
conf.pruned_prog_path = os.path.join(
data_dir,
os.path.join(self.pruned_dir, "pruned_main_program.pbtxt"))
if sys.platform == 'win32':
conf.draw = False
else:
conf.draw = True
conf.draw_out_name = "pruned_check"
res = fleet_util._proto_check(conf)
self.assertTrue(res)
def test_visualize(self):
if sys.platform == 'win32':
pass
else:
data_dir = self.download_files()
program_path = os.path.join(
data_dir,
os.path.join(self.train_dir, "join_main_program.pbtxt"))
is_text = True
program = fleet_util._load_program(program_path, is_text)
output_dir = os.path.join(data_dir, self.train_dir)
output_filename = "draw_prog"
fleet_util._visualize_graphviz(program, output_dir, output_filename)
self.assertTrue(
os.path.exists(
os.path.join(output_dir, output_filename + ".dot")))
self.assertTrue(
os.path.exists(
os.path.join(output_dir, output_filename + ".pdf")))
if __name__ == "__main__":
unittest.main()
......@@ -20,9 +20,7 @@ import os
import sys
import inspect
from paddle.fluid.incubate.fleet.utils.fs import LocalFS, FS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.fleet.utils import LocalFS, FS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
class FSTest(unittest.TestCase):
......
......@@ -19,9 +19,7 @@ from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer, fleet, T
import os
import sys
from paddle.fluid.incubate.fleet.utils.fs import LocalFS
from paddle.fluid.incubate.fleet.utils.hdfs import HDFSClient
from paddle.fluid.incubate.fleet.utils.hdfs import FSTimeOut, FSFileExistsError, FSFileNotExistsError
from paddle.fleet.utils import LocalFS, HDFSClient, FSTimeOut, FSFileExistsError, FSFileNotExistsError
java_home = os.environ["JAVA_HOME"]
......
......@@ -14,13 +14,15 @@
from __future__ import print_function
import os
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.dygraph import declarative
from paddle.fluid.dygraph import declarative, ProgramTranslator
from paddle.fluid.dygraph.io import VARIABLE_FILENAME, EXTRA_VAR_INFO_FILENAME
BATCH_SIZE = 32
BATCH_NUM = 20
......@@ -77,8 +79,8 @@ class LinearNetReturnLoss(fluid.dygraph.Layer):
def train(layer):
# create optimizer
adam = fluid.optimizer.AdamOptimizer(
learning_rate=0.1, parameter_list=layer.parameters())
adam = fluid.optimizer.SGDOptimizer(
learning_rate=0.01, parameter_list=layer.parameters())
# create data loader
train_loader = fluid.io.DataLoader.from_generator(capacity=5)
train_loader.set_batch_generator(random_batch_reader())
......@@ -111,37 +113,43 @@ class TestJitSaveLoad(unittest.TestCase):
# config seed
fluid.default_main_program().random_seed = SEED
def train_and_save_model(self):
def train_and_save_model(self, model_path=None, configs=None):
layer = LinearNet(784, 1)
example_inputs, layer, _ = train(layer)
final_model_path = model_path if model_path else self.model_path
orig_input_types = [type(x) for x in example_inputs]
fluid.dygraph.jit.save(
layer=layer, model_path=self.model_path, input_spec=example_inputs)
layer=layer,
model_path=final_model_path,
input_spec=example_inputs,
configs=configs)
new_input_types = [type(x) for x in example_inputs]
self.assertEqual(orig_input_types, new_input_types)
return layer
def test_save(self):
# train and save model
self.train_and_save_model()
def test_load_infernece(self):
def test_save_load(self):
# train and save model
train_layer = self.train_and_save_model()
# load model
infer_layer = fluid.dygraph.jit.load(self.model_path)
program_translator = ProgramTranslator()
program_translator.enable(False)
loaded_layer = fluid.dygraph.jit.load(self.model_path)
self.load_and_inference(train_layer, loaded_layer)
self.load_dygraph_state_dict(train_layer)
self.load_and_finetune(train_layer, loaded_layer)
program_translator.enable(True)
def load_and_inference(self, train_layer, infer_layer):
train_layer.eval()
infer_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
np.random.random((1, 784)).astype('float32'))
self.assertTrue(
np.array_equal(train_layer(x).numpy(), infer_layer(x).numpy()))
def test_load_finetune(self):
# train and save model
train_layer = self.train_and_save_model()
# load model
load_train_layer = fluid.dygraph.jit.load(self.model_path)
def load_and_finetune(self, train_layer, load_train_layer):
train_layer.train()
load_train_layer.train()
# train & compare
_, _, train_loss = train(train_layer)
......@@ -149,6 +157,19 @@ class TestJitSaveLoad(unittest.TestCase):
self.assertTrue(
np.array_equal(train_loss.numpy(), load_train_loss.numpy()))
def load_dygraph_state_dict(self, train_layer):
train_layer.eval()
# construct new model
new_layer = LinearNet(784, 1)
model_dict, _ = fluid.dygraph.load_dygraph(self.model_path)
new_layer.set_dict(model_dict)
new_layer.eval()
# inference & compare
x = fluid.dygraph.to_variable(
np.random.random((1, 784)).astype('float32'))
self.assertTrue(
np.array_equal(train_layer(x).numpy(), new_layer(x).numpy()))
def test_save_get_program_failed(self):
layer = LinearNetNotDeclarative(784, 1)
example_inputs, layer, _ = train(layer)
......@@ -158,6 +179,31 @@ class TestJitSaveLoad(unittest.TestCase):
model_path=self.model_path,
input_spec=example_inputs)
def test_load_dygraph_no_path(self):
model_path = "model.test_jit_save_load.no_path"
new_layer = LinearNet(784, 1)
with self.assertRaises(ValueError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
def test_load_dygraph_no_var_info(self):
model_path = "model.test_jit_save_load.no_var_info"
self.train_and_save_model(model_path=model_path)
# remove `__variables.info__`
var_info_path = os.path.join(model_path, EXTRA_VAR_INFO_FILENAME)
os.remove(var_info_path)
new_layer = LinearNet(784, 1)
with self.assertRaises(RuntimeError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
def test_load_dygraph_not_var_file(self):
model_path = "model.test_jit_save_load.no_var_file"
configs = fluid.dygraph.jit.SaveLoadConfig()
configs.params_filename = "__params__"
self.train_and_save_model(model_path=model_path, configs=configs)
new_layer = LinearNet(784, 1)
with self.assertRaises(RuntimeError):
model_dict, _ = fluid.dygraph.load_dygraph(model_path)
class TestJitSaveLoadConfig(unittest.TestCase):
def setUp(self):
......
......@@ -3,7 +3,7 @@ set -e
# use default values
# FIXME: randomly fails with "Unknown command lines -c (or -m)".
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
python ${launch_py} multi_process.py
python ${launch_py} multi_process.py launch
# use paddlecloud
echo "begin test use paddlecloud"
......@@ -18,12 +18,12 @@ export PADDLE_PORT=35019
export TRAINER_PORTS_NUM=2
distributed_args="--use_paddlecloud --cluster_node_ips=${cluster_node_ips} --node_ip=${node_ip} --selected_gpus=0,1 --log_dir=testlog"
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch
str1="selected_gpus:0 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35019 trainer_id:0"
str2="selected_gpus:1 worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020 trainers_num:4 current_endpoint:127.0.0.1:35020 trainer_id:1"
file_0="multi_process.check_0.log"
file_1="multi_process.check_1.log"
file_0="multi_process_launch.check_0.log"
file_1="multi_process_launch.check_1.log"
echo "paddlecloud params test"
if grep -q "$str1" "$file_0"; then
......@@ -54,7 +54,7 @@ unset TRAINER_PORTS_NUM
echo ""
echo "paddle.distributed.launch async poll process test"
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py abort; then
if ! CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} ${distributed_args} multi_process.py launch abort; then
echo "train abort as planned"
fi
......
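# The expected strings above encode how launch.py expands the paddlecloud env
# vars into trainer endpoints: each node ip is paired with TRAINER_PORTS_NUM
# consecutive ports starting at PADDLE_PORT. A sketch of that expansion in
# Python (illustrative only, not the launch.py source):
def sketch_cloud_endpoints(node_ips, base_port, ports_num):
    return [
        "%s:%d" % (ip, base_port + offset)
        for ip in node_ips
        for offset in range(ports_num)
    ]

# reproduces worker_endpoints:127.0.0.1:35019,127.0.0.1:35020,127.0.0.2:35019,127.0.0.2:35020
assert sketch_cloud_endpoints(["127.0.0.1", "127.0.0.2"], 35019, 2) == [
    "127.0.0.1:35019", "127.0.0.1:35020", "127.0.0.2:35019", "127.0.0.2:35020"]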
......@@ -63,7 +63,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
"diagonal: TypeError":
"diagonal in {} must be a python Int".format(op_type),
"input: ValueError":
"input shape in {} must be at least 2-D".format(op_type),
"x shape in {} must be at least 2-D".format(op_type),
}
class FailureCase(unittest.TestCase):
......@@ -71,7 +71,7 @@ def case_generator(op_type, Xshape, diagonal, expected):
data = fluid.data(shape=Xshape, dtype='float64', name=cls_name)
with self.assertRaisesRegexp(
eval(expected.split(':')[-1]), errmsg[expected]):
getattr(tensor, op_type)(input=data, diagonal=diagonal)
getattr(tensor, op_type)(x=data, diagonal=diagonal)
class SuccessCase(TrilTriuOpDefaultTest):
def initTestCase(self):
......
......@@ -81,7 +81,7 @@ class API_TestUnsqueeze(unittest.TestCase):
def test_out(self):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.layers.data('data1', shape=[-1, 10], dtype='float64')
result_squeeze = paddle.unsqueeze(data1, axes=[1])
result_squeeze = paddle.unsqueeze(data1, axis=[1])
place = fluid.CPUPlace()
exe = fluid.Executor(place)
input1 = np.random.random([5, 1, 10]).astype('float64')
......@@ -98,7 +98,7 @@ class TestUnsqueezeOpError(unittest.TestCase):
def test_axes_type():
x6 = fluid.layers.data(
shape=[-1, 10], dtype='float16', name='x3')
paddle.unsqueeze(x6, axes=3.2)
paddle.unsqueeze(x6, axis=3.2)
self.assertRaises(TypeError, test_axes_type)
......@@ -108,7 +108,7 @@ class API_TestUnsqueeze2(unittest.TestCase):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
data2 = fluid.data('data2', shape=[1], dtype='int32')
result_squeeze = paddle.unsqueeze(data1, axes=data2)
result_squeeze = paddle.unsqueeze(data1, axis=data2)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
input1 = np.random.random([5, 1, 10]).astype('float64')
......@@ -125,7 +125,7 @@ class API_TestUnsqueeze3(unittest.TestCase):
with fluid.program_guard(fluid.Program(), fluid.Program()):
data1 = fluid.data('data1', shape=[-1, 10], dtype='float64')
data2 = fluid.data('data2', shape=[1], dtype='int32')
result_squeeze = paddle.unsqueeze(data1, axes=[data2, 3])
result_squeeze = paddle.unsqueeze(data1, axis=[data2, 3])
place = fluid.CPUPlace()
exe = fluid.Executor(place)
input1 = np.random.random([5, 1, 10, 1]).astype('float64')
......@@ -143,7 +143,7 @@ class API_TestDyUnsqueeze(unittest.TestCase):
input_1 = np.random.random([5, 1, 10]).astype("int32")
input1 = np.squeeze(input_1, axis=1)
input = fluid.dygraph.to_variable(input_1)
output = paddle.unsqueeze(input, axes=[1])
output = paddle.unsqueeze(input, axis=[1])
out_np = output.numpy()
self.assertTrue(np.allclose(input1, out_np))
......@@ -154,7 +154,7 @@ class API_TestDyUnsqueeze2(unittest.TestCase):
input_1 = np.random.random([5, 1, 10]).astype("int32")
input1 = np.squeeze(input_1, axis=1)
input = fluid.dygraph.to_variable(input_1)
output = paddle.unsqueeze(input, axes=1)
output = paddle.unsqueeze(input, axis=1)
out_np = output.numpy()
self.assertTrue(np.allclose(input1, out_np))
......
......@@ -248,7 +248,7 @@ def zeros(shape, dtype=None, name=None):
# shape is a Tensor
shape = paddle.fill_constant(shape=[2], dtype='int32', value=2)
data3 = paddle.ones(shape=shape, dtype='int32')
data3 = paddle.zeros(shape=shape, dtype='int32')
# [[0 0]
# [0 0]]
"""
......@@ -490,14 +490,13 @@ def _tril_triu_op(helper):
"""Base op of tril_op and triu_op
"""
op_type = helper.layer_type
x = helper.kwargs.get('input', None)
x = helper.kwargs.get('x', None)
assert x is not None, 'x cannot be None in {}'.format(op_type)
check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'],
op_type)
if len(x.shape) < 2:
raise ValueError("input shape in {} must be at least 2-D".format(
op_type))
raise ValueError("x shape in {} must be at least 2-D".format(op_type))
diagonal = helper.kwargs.get('diagonal', 0)
if not isinstance(diagonal, (int, )):
raise TypeError("diagonal in {} must be a python Int".format(op_type))
......@@ -521,18 +520,18 @@ def _tril_triu_op(helper):
return out
def tril(input, diagonal=0, name=None):
def tril(x, diagonal=0, name=None):
"""
:alias_main: paddle.tril
:alias: paddle.tril,paddle.tensor.tril,paddle.tensor.creation.tril
This op returns the lower triangular part of a matrix (2-D tensor) or batch
of matrices :attr:`input`, the other elements of the result tensor are set
of matrices :attr:`x`, the other elements of the result tensor are set
to 0. The lower triangular part of the matrix is defined as the elements
on and below the diagonal.
Args:
input (Variable): The input variable which is a Tensor.
x (Variable): The input variable x which is a Tensor.
Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0.
If :attr:`diagonal` = 0, all elements on and below the main diagonal are
......@@ -545,47 +544,41 @@ def tril(input, diagonal=0, name=None):
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, results of lower triangular operation by the specified diagonal of input tensor,
it's data type is the same as input's Tensor.
Variable: Tensor, results of lower triangular operation by the specified diagonal of the input tensor x,
its data type is the same as that of x.
Raises:
TypeError: diagonal is not an int type.
ValueError: dimension of :attr:`input` is less than 2.
ValueError: dimension of :attr:`x` is less than 2.
Examples:
.. code-block:: python
import numpy as np
import paddle.tensor as tensor
import paddle.fluid as fluid
import paddle
data = np.arange(1, 13, dtype="int64").reshape(3,-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
exe = fluid.Executor(fluid.CPUPlace())
# example 1, default diagonal
tril = tensor.tril(x)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
paddle.enable_imperative()
x = paddle.imperative.to_variable(data)
tril1 = paddle.tensor.tril(x)
# array([[ 1, 0, 0, 0],
# [ 5, 6, 0, 0],
# [ 9, 10, 11, 0]])
# example 2, positive diagonal value
tril = tensor.tril(x, diagonal=2)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
tril2 = paddle.tensor.tril(x, diagonal=2)
# array([[ 1, 2, 3, 0],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
# example 3, negative diagonal value
tril = tensor.tril(x, diagonal=-1)
tril_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[tril], return_numpy=True)
tril3 = paddle.tensor.tril(x, diagonal=-1)
# array([[ 0, 0, 0, 0],
# [ 5, 0, 0, 0],
# [ 9, 10, 0, 0]])
......@@ -593,23 +586,23 @@ def tril(input, diagonal=0, name=None):
"""
if in_dygraph_mode():
op = getattr(core.ops, 'tril_triu')
return op(input, 'diagonal', diagonal, "lower", True)
return op(x, 'diagonal', diagonal, "lower", True)
return _tril_triu_op(LayerHelper('tril', **locals()))
def triu(input, diagonal=0, name=None):
def triu(x, diagonal=0, name=None):
"""
:alias_main: paddle.triu
:alias: paddle.triu,paddle.tensor.triu,paddle.tensor.creation.triu
This op returns the upper triangular part of a matrix (2-D tensor) or batch of matrices
:attr:`input`, the other elements of the result tensor are set to 0.
:attr:`x`, the other elements of the result tensor are set to 0.
The upper triangular part of the matrix is defined as the elements on and
above the diagonal.
Args:
input (Variable): The input variable which is a Tensor.
x (Variable): The input variable x which is a Tensor.
Support data types: ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0.
If :attr:`diagonal` = 0, all elements on and above the main diagonal are
......@@ -622,47 +615,41 @@ def triu(input, diagonal=0, name=None):
user to set this property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
Variable: Tensor, results of upper triangular operation by the specified diagonal of input tensor,
it's data type is the same as input's Tensor.
Variable: Tensor, results of upper triangular operation by the specified diagonal of the input tensor x,
its data type is the same as that of x.
Raises:
TypeError: diagonal is not an int type.
ValueError: dimension of :attr:`input` is less than 2.
ValueError: dimension of :attr:`x` is less than 2.
Examples:
.. code-block:: python
import numpy as np
import paddle.fluid as fluid
import paddle.tensor as tensor
import paddle
data = np.arange(1, 13, dtype="int64").reshape(3,-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 9, 10, 11, 12]])
x = fluid.data(shape=(-1, 4), dtype='int64', name='x')
exe = fluid.Executor(fluid.CPUPlace())
paddle.enable_imperative()
# example 1, default diagonal
triu = tensor.triu(x)
triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[triu], return_numpy=True)
x = paddle.imperative.to_variable(data)
triu1 = paddle.tensor.triu(x)
# array([[ 1, 2, 3, 4],
# [ 0, 6, 7, 8],
# [ 0, 0, 11, 12]])
# example 2, positive diagonal value
triu = tensor.triu(x, diagonal=2)
triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[triu], return_numpy=True)
triu2 = paddle.tensor.triu(x, diagonal=2)
# array([[0, 0, 3, 4],
# [0, 0, 0, 8],
# [0, 0, 0, 0]])
# example 3, negative diagonal value
triu = tensor.triu(x, diagonal=-1)
triu_out, = exe.run(fluid.default_main_program(), feed={"x": data},
fetch_list=[triu], return_numpy=True)
triu3 = paddle.tensor.triu(x, diagonal=-1)
# array([[ 1, 2, 3, 4],
# [ 5, 6, 7, 8],
# [ 0, 10, 11, 12]])
......@@ -670,7 +657,7 @@ def triu(input, diagonal=0, name=None):
"""
if in_dygraph_mode():
op = getattr(core.ops, 'tril_triu')
return op(input, 'diagonal', diagonal, "lower", False)
return op(x, 'diagonal', diagonal, "lower", False)
return _tril_triu_op(LayerHelper('triu', **locals()))
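# tril and triu dispatch to the same tril_triu kernel and differ only in the
# "lower" attribute. The keep/zero rule they implement fits in a few lines of
# NumPy (an equivalence sketch, not the Paddle kernel):
import numpy as np

def sketch_tril_triu(x, diagonal=0, lower=True):
    rows, cols = x.shape[-2], x.shape[-1]
    i, j = np.indices((rows, cols))
    # keep entries on/below the diagonal for tril, on/above it for triu
    mask = (j - i <= diagonal) if lower else (j - i >= diagonal)
    return np.where(mask, x, 0)

data = np.arange(1, 13, dtype="int64").reshape(3, -1)
assert (sketch_tril_triu(data, 0, lower=True) == np.tril(data)).all()
assert (sketch_tril_triu(data, 2, lower=False) == np.triu(data, 2)).all()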
......
......@@ -729,26 +729,32 @@ def bmm(x, y, name=None):
Examples:
import paddle
import paddle.fluid as fluid
x = fluid.layers.data(name='x', shape=[10, 3, 4], dtype='float32')
y = fluid.layers.data(name='y', shape=[10, 4, 5], dtype='float32')
out = paddle.bmm(x, y)
# In dygraph mode:
# In imperative mode:
import numpy as np
# size input1: (2, 2, 3) and input2: (2, 3, 2)
input1 = np.array([[[1.0, 1.0, 1.0],[2.0, 2.0, 2.0]],[[3.0, 3.0, 3.0],[4.0, 4.0, 4.0]]])
input2 = np.array([[[1.0, 1.0],[2.0, 2.0],[3.0, 3.0]],[[4.0, 4.0],[5.0, 5.0],[6.0, 6.0]]])
with fluid.dygraph.guard():
x = fluid.dygraph.to_variable(input1)
y = fluid.dygraph.to_variable(input2)
out = paddle.bmm(x, y)
#output size: (2, 2, 2)
#output value:
#[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
out_np = out.numpy()
paddle.enable_imperative()
x = paddle.imperative.to_variable(input1)
y = paddle.imperative.to_variable(input2)
out = paddle.bmm(x, y)
#output size: (2, 2, 2)
#output value:
#[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]]
out_np = out.numpy()
"""
x_shape = x.shape
y_shape = y.shape
if not len(x_shape) == len(y_shape) == 3:
raise ValueError(
"x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}".
format(x_shape, y_shape))
if x_shape[2] != y_shape[1]:
raise ValueError(
"x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".
format(x_shape, y_shape))
helper = LayerHelper('bmm', **locals())
if in_dygraph_mode():
return core.ops.bmm(x, y)
......
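# bmm is a strict batched matmul: both operands must be 3-D, batch sizes must
# match, and x's last dim must equal y's middle dim, i.e.
# (b, m, k) x (b, k, n) -> (b, m, n). NumPy reproduces the documented example
# (a semantics sketch, not the Paddle kernel):
import numpy as np

input1 = np.array([[[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]],
                   [[3.0, 3.0, 3.0], [4.0, 4.0, 4.0]]])
input2 = np.array([[[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
                   [[4.0, 4.0], [5.0, 5.0], [6.0, 6.0]]])
expected = np.array([[[6.0, 6.0], [12.0, 12.0]],
                     [[45.0, 45.0], [60.0, 60.0]]])
assert np.allclose(np.matmul(input1, input2), expected)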
......@@ -42,11 +42,32 @@ from ..fluid import layers
import paddle
__all__ = [
'cast', 'concat', 'expand', 'expand_as', 'flatten', 'gather', 'gather_nd',
'reshape', 'reverse', 'scatter', 'scatter_nd_add', 'scatter_nd',
'shard_index', 'slice', 'split', 'squeeze', 'stack', 'strided_slice',
'transpose', 'unique', 'unique_with_counts', 'unsqueeze', 'unstack', 'flip',
'unbind', 'roll'
'cast',
'concat',
'expand',
'expand_as',
'flatten',
'gather',
'gather_nd',
'reshape',
'reverse',
'scatter',
'scatter_nd_add',
'scatter_nd',
'shard_index',
'slice',
'split',
'squeeze',
'stack',
'strided_slice',
'transpose',
'unique',
'unique_with_counts',
'unsqueeze',
'unstack',
'flip',
'unbind',
'roll',
]
......@@ -417,7 +438,7 @@ def stack(x, axis=0, name=None):
Args:
x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
If ``x`` is a ``list``, the Tensors in ``x``
must be of the same shape and dtype. Support data types: float32, float64, int32, int64.
must be of the same shape and dtype. Supported data types: float32, float64, int32, int64.
axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
......@@ -559,18 +580,19 @@ def squeeze(x, axis=None, name=None):
out.shape = [1, 3, 5]
Args:
input (Tensor): The input Tensor. Support data type: float32, float64, int8, int32, int64.
x (Tensor): The input Tensor. Supported data type: float32, float64, bool, int8, int32, int64.
axis (int|list|tuple, optional): An integer or list of integers, indicating the dimensions to be squeezed. Default is None.
The range of axis is :math:`[-ndim(input), ndim(input))`.
If axis is negative, :math:`axis = axis + ndim(input)`.
If axis is None, all the dimensions of input of size 1 will be removed.
The range of axis is :math:`[-ndim(x), ndim(x))`.
If axis is negative, :math:`axis = axis + ndim(x)`.
If axis is None, all the dimensions of x of size 1 will be removed.
name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.
Returns:
Tensor: Output squeezed Tensor. Data type is same as input Tensor.
Tensor: Squeezed Tensor with the same data type as input Tensor.
Examples:
.. code-block:: python
import paddle
paddle.enable_imperative()
......@@ -590,87 +612,50 @@ def squeeze(x, axis=None, name=None):
return layers.squeeze(x, axis, name)
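# paddle.squeeze delegates to layers.squeeze; its axis semantics match
# np.squeeze, so NumPy serves as a compact reference (an equivalence sketch
# under that assumption, not the Paddle implementation):
import numpy as np

x = np.zeros((1, 3, 1, 5))
assert np.squeeze(x).shape == (3, 5)             # axis=None drops every size-1 dim
assert np.squeeze(x, axis=0).shape == (3, 1, 5)  # a given axis must have size 1
assert np.squeeze(x, axis=-2).shape == (1, 3, 5) # negative axis wraps: -2 -> 2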
def unsqueeze(input, axes, out=None, name=None):
def unsqueeze(x, axis, name=None):
"""
:alias_main: paddle.unsqueeze
:alias: paddle.unsqueeze,paddle.tensor.unsqueeze,paddle.tensor.manipulation.unsqueeze
Insert single-dimensional entries to the shape of a Tensor. Takes one
required argument axes, a list of dimensions that will be inserted.
Dimension indices in axes are as seen in the output tensor.
For example:
.. code-block:: text
:alias: paddle.unsqueeze, paddle.tensor.unsqueeze, paddle.tensor.manipulation.unsqueeze
Given a tensor such that tensor with shape [3, 4, 5],
then Unsqueezed tensor with axes=[0, 4] has shape [1, 3, 4, 5, 1].
Insert single-dimensional entries to the shape of input Tensor ``x``. Takes one
required argument axis, a dimension or list of dimensions that will be inserted.
Dimension indices in axis are as seen in the output tensor.
Args:
input (Variable): The input Tensor to be unsqueezed. It is a N-D Tensor of data types float32, float64, int32.
axes (int|list|tuple|Variable): Indicates the dimensions to be inserted. The data type is ``int32`` . If ``axes`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axes`` is an Variable, it should be an 1-D Tensor .
name (str|None): Name for this layer.
x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64.
axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` .
If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1].
If ``axis`` is a Tensor, it should be a 1-D Tensor.
If ``axis`` is negative, ``axis = axis + ndim(x) + 1``.
name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None.
Returns:
Variable: Output unsqueezed Tensor, with data type being float32, float64, int32, int64.
Tensor: Unsqueezed Tensor with the same data type as input Tensor.
Examples:
.. code-block:: python
import numpy as np
import paddle
import paddle.fluid as fluid
with fluid.dygraph.guard():
input_1 = np.random.random([5, 10]).astype("int32")
# input is a variable which shape is [5, 10]
input = fluid.dygraph.to_variable(input_1)
paddle.enable_imperative()
x = paddle.rand([5, 10])
print(x.shape) # [5, 10]
out1 = paddle.unsqueeze(x, axis=0)
print(out1.shape) # [1, 5, 10]
out2 = paddle.unsqueeze(x, axis=[0, 2])
print(out2.shape) # [1, 5, 1, 10]
output = paddle.unsqueeze(input, axes=[1])
# output.shape [5, 1, 10]
axis = paddle.fluid.dygraph.to_variable([0, 1, 2])
out3 = paddle.unsqueeze(x, axis=axis)
print(out3.shape) # [1, 1, 1, 5, 10]
"""
if not isinstance(axes, (int, list, tuple, Variable)):
raise TypeError(
"The type of 'axes' in unsqueeze must be int, list, tuple or Variable, but "
"received %s." % (type(axes)))
helper = LayerHelper("unsqueeze2", **locals())
inputs = {"X": input}
attrs = {}
def _to_Variable_list(one_list):
Variable_list = []
for ele in one_list:
if isinstance(ele, Variable):
ele.stop_gradient = True
Variable_list.append(ele)
else:
assert (isinstance(ele, int))
temp_out = helper.create_variable_for_type_inference('int32')
fill_constant([1], 'int32', ele, force_cpu=True, out=temp_out)
Variable_list.append(temp_out)
return Variable_list
if isinstance(axes, int):
axes = [axes]
if isinstance(axes, Variable):
axes.stop_gradient = True
inputs["AxesTensor"] = axes
elif isinstance(axes, (list, tuple)):
contain_var = not all(not isinstance(ele, Variable) for ele in axes)
if contain_var:
inputs["AxesTensorList"] = _to_Variable_list(axes)
else:
attrs["axes"] = axes
out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
helper.append_op(
type="unsqueeze2",
inputs=inputs,
attrs=attrs,
outputs={"Out": out,
"XShape": x_shape})
if isinstance(axis, int):
axis = [axis]
return out
return layers.unsqueeze(x, axis, name)
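# After normalizing a bare int to a one-element list, the wrapper defers to
# layers.unsqueeze. Each axis index refers to a position in the *output*
# tensor, so inserting at [0, 2] on a [5, 10] input yields [1, 5, 1, 10], as
# in the docstring. np.expand_dims follows the same rule (equivalence sketch,
# not the Paddle implementation; tuple axis needs NumPy >= 1.18):
import numpy as np

x = np.random.rand(5, 10)
assert np.expand_dims(x, 0).shape == (1, 5, 10)
assert np.expand_dims(x, (0, 2)).shape == (1, 5, 1, 10)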
def gather(input, index, overwrite=True):
......
......@@ -915,7 +915,7 @@ def mm(input, mat2, name=None):
return out
def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):
"""
:alias_main: paddle.addmm
:alias: paddle.addmm,paddle.tensor.addmm,paddle.tensor.math.addmm
......@@ -935,8 +935,8 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
input (Variable): The input Tensor/LoDTensor to be added to the final result.
x (Variable): The first input Tensor/LoDTensor for matrix multiplication.
y (Variable): The second input Tensor/LoDTensor for matrix multiplication.
alpha (float): Coefficient of $x*y$.
beta (float): Coefficient of $input$.
alpha (float): Coefficient of $x*y$.
name (str, optional): Name of the output. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Default is None.
Returns:
......@@ -947,25 +947,43 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
import numpy as np
import paddle
import paddle.fluid as fluid
input = fluid.data(name='input', shape=[2, 2], dtype='float32')
x = fluid.data(name='x', shape=[2, 2], dtype='float32')
y = fluid.data(name='y', shape=[2, 2], dtype='float32')
out = paddle.addmm( input=input, x=x, y=y, alpha=5.0, beta=0.5 )
data_x = np.ones((2, 2)).astype(np.float32)
data_y = np.ones((2, 2)).astype(np.float32)
data_input = np.ones((2, 2)).astype(np.float32)
place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() else fluid.CPUPlace()
exe = fluid.Executor(place)
results = exe.run(fluid.default_main_program(),
fetch_list=[out], feed={"input": data_input, 'x': data_x, "y": data_y})
print( np.array(results[0]) )
paddle.enable_imperative()
x = paddle.imperative.to_variable(data_x)
y = paddle.imperative.to_variable(data_y)
input = paddle.imperative.to_variable(data_input)
out = paddle.tensor.addmm( input=input, x=x, y=y, beta=0.5, alpha=5.0 )
print( out.numpy() )
# [[10.5 10.5]
# [10.5 10.5]]
"""
input_shape = input.shape
x_shape = x.shape
y_shape = y.shape
if not len(input_shape) == len(x_shape) == len(y_shape) == 2:
raise ValueError("The dimention of input, x, y should be 2 but receive input's shape: {}, x's shape: {}, y's shape: {}".format(input_shape, x_shape, y_shape))
if input_shape[0] != x_shape[0]:
if input_shape[0] != 1:
raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
if input_shape[1] != y_shape[1] and input_shape[1] != 1:
raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
if input_shape[1] != y_shape[1]:
if input_shape[1] != 1:
raise ValueError( "When y's dimension[1] is not equal with input's dimension[1], input's dimension[1] must be 1 but got {}".format(input_shape[1]))
if input_shape[0] != x_shape[0] and input_shape[0] != 1:
raise ValueError( "When x's dimension[0] is not equal with input's dimension[0], input's dimension[0] must be 1 but got {}".format(input_shape[0]))
if x_shape[1] != y_shape[0]:
raise ValueError("The input Variable x's width must be equal with Variable y' height. But received x's shape = {}, y's shape = {}.".format(x_shape, y_shape))
if in_dygraph_mode():
out = core.ops.addmm(input, x, y, "Alpha", alpha, "Beta", beta)
return out
......@@ -974,7 +992,7 @@ def addmm(input, x, y, alpha=1.0, beta=1.0, name=None):
attrs = {'Alpha': alpha, 'Beta': beta}
helper = LayerHelper("addmm", **locals())
check_variable_and_dtype(x, 'Input', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(input, 'Input', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
out = helper.create_variable_for_type_inference(dtype=x.dtype)
......
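# addmm computes out = beta * input + alpha * (x @ y), and input may broadcast
# along a dimension only when that dimension is 1 -- exactly what the shape
# checks above enforce. NumPy reproduces the documented example (a semantics
# sketch, not the Paddle kernel):
import numpy as np

data_x = np.ones((2, 2), dtype=np.float32)
data_y = np.ones((2, 2), dtype=np.float32)
data_input = np.ones((2, 2), dtype=np.float32)
out = 0.5 * data_input + 5.0 * np.matmul(data_x, data_y)
assert np.allclose(out, [[10.5, 10.5], [10.5, 10.5]])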
......@@ -21,3 +21,4 @@ prettytable
objgraph
astor
pathlib
netifaces
......@@ -152,6 +152,7 @@ packages=['paddle',
'paddle.fleet.dataset',
'paddle.fleet.metrics',
'paddle.fleet.proto',
'paddle.fleet.utils',
'paddle.framework',
'paddle.fluid',
'paddle.fluid.dygraph',
......